Upload folder using huggingface_hub
Browse files- main.py +99 -48
- models/anomaly-detection/main.py +9 -9
- models/anomaly-detection/src/pipeline/train.py +44 -0
- models/currency-volatility-prediction/main.py +7 -5
- models/currency-volatility-prediction/src/pipeline/train.py +58 -0
- models/stock-price-prediction/main.py +7 -5
- models/stock-price-prediction/src/pipeline/train.py +63 -0
- models/weather-prediction/main.py +7 -5
- models/weather-prediction/pyproject.toml +1 -0
- models/weather-prediction/src/components/data_ingestion.py +65 -0
- models/weather-prediction/src/pipeline/train.py +55 -0
- models/weather-prediction/src/utils/tutiempo_scraper.py +18 -5
- models/weather-prediction/uv.lock +44 -0
- requirements.txt +2 -0
- src/config/intel_config.json +82 -69
- src/nodes/intelligenceAgentNode.py +114 -80
- src/nodes/socialAgentNode.py +128 -89
- src/utils/utils.py +36 -25
main.py
CHANGED
|
@@ -727,21 +727,57 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
|
|
| 727 |
# Get recent feeds
|
| 728 |
feeds = storage_manager.get_recent_feeds(limit=100)
|
| 729 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
if not _load_anomaly_components():
|
| 731 |
-
#
|
| 732 |
-
anomalies = [
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
return {
|
| 738 |
-
"anomalies": anomalies,
|
| 739 |
"total": len(anomalies),
|
| 740 |
-
"
|
| 741 |
-
"
|
|
|
|
| 742 |
}
|
| 743 |
|
| 744 |
-
#
|
| 745 |
anomalies = []
|
| 746 |
for feed in feeds:
|
| 747 |
summary = feed.get("summary", "")
|
|
@@ -758,13 +794,13 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
|
|
| 758 |
else:
|
| 759 |
score = 1.0 if prediction == -1 else 0.0
|
| 760 |
|
| 761 |
-
# Normalize score to 0-1 range
|
| 762 |
normalized_score = max(0, min(1, (score + 0.5)))
|
| 763 |
|
| 764 |
if prediction == -1 or normalized_score >= threshold:
|
| 765 |
anomalies.append({
|
| 766 |
**feed,
|
| 767 |
-
"anomaly_score": float(normalized_score),
|
| 768 |
"is_anomaly": prediction == -1,
|
| 769 |
"language": lang
|
| 770 |
})
|
|
@@ -783,7 +819,7 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
|
|
| 783 |
"anomalies": anomalies,
|
| 784 |
"total": len(anomalies),
|
| 785 |
"threshold": threshold,
|
| 786 |
-
"model_status": "
|
| 787 |
}
|
| 788 |
|
| 789 |
except Exception as e:
|
|
@@ -1117,42 +1153,57 @@ def remove_intel_target(target_type: str, value: str, platform: Optional[str] =
|
|
| 1117 |
_weather_predictor = None
|
| 1118 |
|
| 1119 |
def get_weather_predictor():
|
| 1120 |
-
"""Lazy-load the weather predictor."""
|
| 1121 |
global _weather_predictor
|
| 1122 |
-
if _weather_predictor is None:
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
-
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1156 |
|
| 1157 |
|
| 1158 |
@app.get("/api/weather/predictions")
|
|
|
|
| 727 |
# Get recent feeds
|
| 728 |
feeds = storage_manager.get_recent_feeds(limit=100)
|
| 729 |
|
| 730 |
+
if not feeds:
|
| 731 |
+
# No feeds yet - return helpful message
|
| 732 |
+
return {
|
| 733 |
+
"anomalies": [],
|
| 734 |
+
"total": 0,
|
| 735 |
+
"model_status": "no_data",
|
| 736 |
+
"message": "No feed data available yet. Wait for graph execution to complete."
|
| 737 |
+
}
|
| 738 |
+
|
| 739 |
if not _load_anomaly_components():
|
| 740 |
+
# Use severity + keyword-based scoring as intelligent fallback
|
| 741 |
+
anomalies = []
|
| 742 |
+
anomaly_keywords = ["emergency", "crisis", "breaking", "urgent", "alert",
|
| 743 |
+
"warning", "critical", "disaster", "flood", "protest"]
|
| 744 |
+
|
| 745 |
+
for f in feeds:
|
| 746 |
+
score = 0.0
|
| 747 |
+
summary = str(f.get("summary", "")).lower()
|
| 748 |
+
severity = f.get("severity", "low")
|
| 749 |
+
|
| 750 |
+
# Severity-based scoring
|
| 751 |
+
if severity == "critical": score = 0.9
|
| 752 |
+
elif severity == "high": score = 0.75
|
| 753 |
+
elif severity == "medium": score = 0.5
|
| 754 |
+
else: score = 0.25
|
| 755 |
+
|
| 756 |
+
# Keyword boosting
|
| 757 |
+
keyword_matches = sum(1 for kw in anomaly_keywords if kw in summary)
|
| 758 |
+
if keyword_matches > 0:
|
| 759 |
+
score = min(1.0, score + (keyword_matches * 0.1))
|
| 760 |
+
|
| 761 |
+
# Only include if above threshold
|
| 762 |
+
if score >= threshold:
|
| 763 |
+
anomalies.append({
|
| 764 |
+
**f,
|
| 765 |
+
"anomaly_score": round(score, 3),
|
| 766 |
+
"is_anomaly": score >= 0.7
|
| 767 |
+
})
|
| 768 |
+
|
| 769 |
+
# Sort by anomaly score
|
| 770 |
+
anomalies.sort(key=lambda x: x.get("anomaly_score", 0), reverse=True)
|
| 771 |
+
|
| 772 |
return {
|
| 773 |
+
"anomalies": anomalies[:limit],
|
| 774 |
"total": len(anomalies),
|
| 775 |
+
"threshold": threshold,
|
| 776 |
+
"model_status": "fallback_scoring",
|
| 777 |
+
"message": "Using severity + keyword scoring. Train ML model for advanced detection."
|
| 778 |
}
|
| 779 |
|
| 780 |
+
# ML Model is loaded - use it for scoring
|
| 781 |
anomalies = []
|
| 782 |
for feed in feeds:
|
| 783 |
summary = feed.get("summary", "")
|
|
|
|
| 794 |
else:
|
| 795 |
score = 1.0 if prediction == -1 else 0.0
|
| 796 |
|
| 797 |
+
# Normalize score to 0-1 range
|
| 798 |
normalized_score = max(0, min(1, (score + 0.5)))
|
| 799 |
|
| 800 |
if prediction == -1 or normalized_score >= threshold:
|
| 801 |
anomalies.append({
|
| 802 |
**feed,
|
| 803 |
+
"anomaly_score": float(round(normalized_score, 3)),
|
| 804 |
"is_anomaly": prediction == -1,
|
| 805 |
"language": lang
|
| 806 |
})
|
|
|
|
| 819 |
"anomalies": anomalies,
|
| 820 |
"total": len(anomalies),
|
| 821 |
"threshold": threshold,
|
| 822 |
+
"model_status": "ml_active"
|
| 823 |
}
|
| 824 |
|
| 825 |
except Exception as e:
|
|
|
|
| 1153 |
_weather_predictor = None
|
| 1154 |
|
| 1155 |
def get_weather_predictor():
|
| 1156 |
+
"""Lazy-load the weather predictor using isolated import."""
|
| 1157 |
global _weather_predictor
|
| 1158 |
+
if _weather_predictor is not None:
|
| 1159 |
+
return _weather_predictor
|
| 1160 |
+
|
| 1161 |
+
try:
|
| 1162 |
+
import importlib.util
|
| 1163 |
+
from pathlib import Path
|
| 1164 |
+
|
| 1165 |
+
# Use importlib.util for fully isolated import (avoids package collisions)
|
| 1166 |
+
weather_src = Path(__file__).parent / "models" / "weather-prediction" / "src"
|
| 1167 |
+
predictor_path = weather_src / "components" / "predictor.py"
|
| 1168 |
+
|
| 1169 |
+
if not predictor_path.exists():
|
| 1170 |
+
logger.error(f"[WeatherAPI] predictor.py not found at {predictor_path}")
|
| 1171 |
+
return None
|
| 1172 |
+
|
| 1173 |
+
# First, ensure entity module is loadable
|
| 1174 |
+
entity_path = weather_src / "entity" / "config_entity.py"
|
| 1175 |
+
if entity_path.exists():
|
| 1176 |
+
entity_spec = importlib.util.spec_from_file_location(
|
| 1177 |
+
"weather_config_entity",
|
| 1178 |
+
str(entity_path)
|
| 1179 |
+
)
|
| 1180 |
+
entity_module = importlib.util.module_from_spec(entity_spec)
|
| 1181 |
+
sys.modules["weather_config_entity"] = entity_module
|
| 1182 |
+
entity_spec.loader.exec_module(entity_module)
|
| 1183 |
+
|
| 1184 |
+
# Add weather src to path temporarily for relative imports
|
| 1185 |
+
import sys
|
| 1186 |
+
weather_src_str = str(weather_src)
|
| 1187 |
+
if weather_src_str not in sys.path:
|
| 1188 |
+
sys.path.insert(0, weather_src_str)
|
| 1189 |
+
|
| 1190 |
+
# Now load predictor module
|
| 1191 |
+
spec = importlib.util.spec_from_file_location(
|
| 1192 |
+
"weather_predictor_module",
|
| 1193 |
+
str(predictor_path)
|
| 1194 |
+
)
|
| 1195 |
+
module = importlib.util.module_from_spec(spec)
|
| 1196 |
+
spec.loader.exec_module(module)
|
| 1197 |
+
|
| 1198 |
+
_weather_predictor = module.WeatherPredictor()
|
| 1199 |
+
logger.info("[WeatherAPI] ✓ Weather predictor initialized via isolated import")
|
| 1200 |
+
return _weather_predictor
|
| 1201 |
+
|
| 1202 |
+
except Exception as e:
|
| 1203 |
+
logger.error(f"[WeatherAPI] Failed to initialize predictor: {e}")
|
| 1204 |
+
import traceback
|
| 1205 |
+
logger.debug(traceback.format_exc())
|
| 1206 |
+
return None
|
| 1207 |
|
| 1208 |
|
| 1209 |
@app.get("/api/weather/predictions")
|
models/anomaly-detection/main.py
CHANGED
|
@@ -4,16 +4,11 @@ Entry point for the anomaly detection training pipeline
|
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import sys
|
| 7 |
-
import logging
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
from src.pipeline import run_training_pipeline
|
| 14 |
-
from src.entity import PipelineConfig
|
| 15 |
-
|
| 16 |
-
# Configure logging
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
| 19 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
@@ -22,9 +17,14 @@ logging.basicConfig(
|
|
| 22 |
logging.FileHandler("training.log")
|
| 23 |
]
|
| 24 |
)
|
| 25 |
-
|
| 26 |
logger = logging.getLogger("main")
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def main():
|
| 30 |
"""Run the anomaly detection training pipeline"""
|
|
|
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import sys
|
| 7 |
+
import logging # Import standard library BEFORE path manipulation
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
+
# CRITICAL: Configure logging BEFORE adding src/ to path
|
| 11 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
logging.basicConfig(
|
| 13 |
level=logging.INFO,
|
| 14 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
|
|
| 17 |
logging.FileHandler("training.log")
|
| 18 |
]
|
| 19 |
)
|
|
|
|
| 20 |
logger = logging.getLogger("main")
|
| 21 |
|
| 22 |
+
# Add src to path - AFTER logging is configured
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
| 24 |
+
|
| 25 |
+
from src.pipeline import run_training_pipeline
|
| 26 |
+
from src.entity import PipelineConfig
|
| 27 |
+
|
| 28 |
|
| 29 |
def main():
|
| 30 |
"""Run the anomaly detection training pipeline"""
|
models/anomaly-detection/src/pipeline/train.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Anomaly Detection Training Script
|
| 3 |
+
Convenience wrapper for: python models/anomaly-detection/main.py
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python models/anomaly-detection/src/pipeline/train.py
|
| 7 |
+
"""
|
| 8 |
+
import sys
|
| 9 |
+
import argparse
|
| 10 |
+
import logging # Import BEFORE path manipulation
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Configure logging BEFORE adding src/ to path
|
| 14 |
+
logging.basicConfig(
|
| 15 |
+
level=logging.INFO,
|
| 16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Add parent directories to path - AFTER logging is configured
|
| 20 |
+
PIPELINE_ROOT = Path(__file__).parent.parent.parent
|
| 21 |
+
sys.path.insert(0, str(PIPELINE_ROOT))
|
| 22 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
parser = argparse.ArgumentParser(description="Anomaly Detection Training")
|
| 26 |
+
parser.add_argument("--help-only", action="store_true", help="Show help and exit")
|
| 27 |
+
|
| 28 |
+
# Parse known args to allow --help to work without loading heavy modules
|
| 29 |
+
args, _ = parser.parse_known_args()
|
| 30 |
+
|
| 31 |
+
print("=" * 60)
|
| 32 |
+
print("ANOMALY DETECTION - TRAINING PIPELINE")
|
| 33 |
+
print("=" * 60)
|
| 34 |
+
|
| 35 |
+
# Import and run from main.py
|
| 36 |
+
from main import main
|
| 37 |
+
|
| 38 |
+
result = main()
|
| 39 |
+
|
| 40 |
+
if result:
|
| 41 |
+
print("=" * 60)
|
| 42 |
+
print("TRAINING COMPLETE!")
|
| 43 |
+
print(f"Best model: {result.model_trainer.best_model_name}")
|
| 44 |
+
print("=" * 60)
|
models/currency-volatility-prediction/main.py
CHANGED
|
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
-
import logging
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 16 |
-
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
| 19 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 20 |
)
|
| 21 |
logger = logging.getLogger("currency_prediction")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def run_data_ingestion(period: str = "2y"):
|
| 25 |
"""Run data ingestion from yfinance."""
|
|
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
+
import logging # Import standard library BEFORE path manipulation
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
+
# CRITICAL: Configure logging BEFORE adding src/ to path
|
| 14 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
|
|
|
|
|
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
| 17 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 18 |
)
|
| 19 |
logger = logging.getLogger("currency_prediction")
|
| 20 |
|
| 21 |
+
# Setup paths - AFTER logging is configured
|
| 22 |
+
PIPELINE_ROOT = Path(__file__).parent
|
| 23 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 24 |
+
|
| 25 |
|
| 26 |
def run_data_ingestion(period: str = "2y"):
|
| 27 |
"""Run data ingestion from yfinance."""
|
models/currency-volatility-prediction/src/pipeline/train.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Currency Volatility Prediction Training Script
|
| 3 |
+
Convenience wrapper for: python models/currency-volatility-prediction/main.py --mode train
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python models/currency-volatility-prediction/src/pipeline/train.py [--epochs 100] [--period 2y]
|
| 7 |
+
"""
|
| 8 |
+
import sys
|
| 9 |
+
import argparse
|
| 10 |
+
import logging # CRITICAL: Import BEFORE path manipulation
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Configure logging BEFORE adding src/ to path
|
| 14 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
| 15 |
+
logging.basicConfig(
|
| 16 |
+
level=logging.INFO,
|
| 17 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# Add parent directories to path - AFTER logging is configured
|
| 21 |
+
PIPELINE_ROOT = Path(__file__).parent.parent.parent
|
| 22 |
+
sys.path.insert(0, str(PIPELINE_ROOT))
|
| 23 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
parser = argparse.ArgumentParser(description="Currency Prediction Training")
|
| 27 |
+
parser.add_argument("--epochs", type=int, default=100, help="Training epochs")
|
| 28 |
+
parser.add_argument("--period", type=str, default="2y", help="Data period (1y, 2y, 5y)")
|
| 29 |
+
parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
|
| 30 |
+
|
| 31 |
+
args = parser.parse_args()
|
| 32 |
+
|
| 33 |
+
# Import from main.py (after path setup)
|
| 34 |
+
from main import run_training, run_full_pipeline, run_data_ingestion
|
| 35 |
+
|
| 36 |
+
print("=" * 60)
|
| 37 |
+
print("CURRENCY (USD/LKR) PREDICTION - TRAINING PIPELINE")
|
| 38 |
+
print("=" * 60)
|
| 39 |
+
|
| 40 |
+
if args.full:
|
| 41 |
+
run_full_pipeline()
|
| 42 |
+
else:
|
| 43 |
+
# Run data ingestion first if no data exists
|
| 44 |
+
try:
|
| 45 |
+
from components.data_ingestion import CurrencyDataIngestion
|
| 46 |
+
ingestion = CurrencyDataIngestion()
|
| 47 |
+
df = ingestion.load_existing()
|
| 48 |
+
print(f"✓ Found existing data: {len(df)} records")
|
| 49 |
+
except FileNotFoundError:
|
| 50 |
+
print("No existing data, running ingestion first...")
|
| 51 |
+
run_data_ingestion(period=args.period)
|
| 52 |
+
|
| 53 |
+
# Run training
|
| 54 |
+
run_training(epochs=args.epochs)
|
| 55 |
+
|
| 56 |
+
print("=" * 60)
|
| 57 |
+
print("TRAINING COMPLETE!")
|
| 58 |
+
print("=" * 60)
|
models/stock-price-prediction/main.py
CHANGED
|
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
-
import logging
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 16 |
-
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
| 19 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 20 |
)
|
| 21 |
logger = logging.getLogger("stock_prediction")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def run_data_ingestion():
|
| 25 |
"""Run data ingestion for all stocks."""
|
|
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
+
import logging # Import standard library BEFORE path manipulation
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
+
# CRITICAL: Configure logging BEFORE adding src/ to path
|
| 14 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
|
|
|
|
|
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
| 17 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 18 |
)
|
| 19 |
logger = logging.getLogger("stock_prediction")
|
| 20 |
|
| 21 |
+
# Setup paths - AFTER logging is configured
|
| 22 |
+
PIPELINE_ROOT = Path(__file__).parent
|
| 23 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 24 |
+
|
| 25 |
|
| 26 |
def run_data_ingestion():
|
| 27 |
"""Run data ingestion for all stocks."""
|
models/stock-price-prediction/src/pipeline/train.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stock Price Prediction Training Script
|
| 3 |
+
Convenience wrapper for: python models/stock-price-prediction/main.py --mode train
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python models/stock-price-prediction/src/pipeline/train.py [--stock JKH] [--no-optuna] [--full]
|
| 7 |
+
"""
|
| 8 |
+
import sys
|
| 9 |
+
import argparse
|
| 10 |
+
import logging # CRITICAL: Import BEFORE path manipulation
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Configure logging BEFORE adding src/ to path
|
| 14 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
| 15 |
+
logging.basicConfig(
|
| 16 |
+
level=logging.INFO,
|
| 17 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# Add parent directories to path - AFTER logging is configured
|
| 21 |
+
PIPELINE_ROOT = Path(__file__).parent.parent.parent
|
| 22 |
+
sys.path.insert(0, str(PIPELINE_ROOT))
|
| 23 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
parser = argparse.ArgumentParser(description="Stock Price Prediction Training")
|
| 27 |
+
parser.add_argument("--stock", type=str, default=None, help="Specific stock to train (e.g., JKH, COMB)")
|
| 28 |
+
parser.add_argument("--no-optuna", action="store_true", help="Disable Optuna hyperparameter optimization")
|
| 29 |
+
parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
|
| 30 |
+
|
| 31 |
+
args = parser.parse_args()
|
| 32 |
+
use_optuna = not args.no_optuna
|
| 33 |
+
|
| 34 |
+
# Import from main.py (after path setup)
|
| 35 |
+
from main import run_training, run_full_pipeline, run_data_ingestion
|
| 36 |
+
|
| 37 |
+
print("=" * 60)
|
| 38 |
+
print("STOCK PRICE (CSE) PREDICTION - TRAINING PIPELINE")
|
| 39 |
+
print("=" * 60)
|
| 40 |
+
|
| 41 |
+
if args.full:
|
| 42 |
+
run_full_pipeline(use_optuna=use_optuna)
|
| 43 |
+
else:
|
| 44 |
+
# Run data ingestion first if no data exists
|
| 45 |
+
try:
|
| 46 |
+
from components.data_ingestion import StockDataIngestion
|
| 47 |
+
ingestion = StockDataIngestion()
|
| 48 |
+
stocks = list(ingestion.config.stocks.keys())
|
| 49 |
+
df = ingestion.load_stock_data(stocks[0])
|
| 50 |
+
if df is not None:
|
| 51 |
+
print(f"✓ Found existing data for {len(stocks)} stocks")
|
| 52 |
+
else:
|
| 53 |
+
raise FileNotFoundError()
|
| 54 |
+
except (FileNotFoundError, Exception):
|
| 55 |
+
print("No existing data, running ingestion first...")
|
| 56 |
+
run_data_ingestion()
|
| 57 |
+
|
| 58 |
+
# Run training
|
| 59 |
+
run_training(use_optuna=use_optuna, stock=args.stock)
|
| 60 |
+
|
| 61 |
+
print("=" * 60)
|
| 62 |
+
print("TRAINING COMPLETE!")
|
| 63 |
+
print("=" * 60)
|
models/weather-prediction/main.py
CHANGED
|
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
-
import logging
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 16 |
-
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
| 19 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 20 |
)
|
| 21 |
logger = logging.getLogger("weather_prediction")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def run_data_ingestion(months: int = 12):
|
| 25 |
"""Run data ingestion for all stations."""
|
|
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
+
import logging # Import standard library BEFORE path manipulation
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
|
| 13 |
+
# CRITICAL: Configure logging BEFORE adding src/ to path
|
| 14 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
|
|
|
|
|
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
| 17 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 18 |
)
|
| 19 |
logger = logging.getLogger("weather_prediction")
|
| 20 |
|
| 21 |
+
# Setup paths - AFTER logging is configured
|
| 22 |
+
PIPELINE_ROOT = Path(__file__).parent
|
| 23 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 24 |
+
|
| 25 |
|
| 26 |
def run_data_ingestion(months: int = 12):
|
| 27 |
"""Run data ingestion for all stations."""
|
models/weather-prediction/pyproject.toml
CHANGED
|
@@ -10,6 +10,7 @@ dependencies = [
|
|
| 10 |
"fastapi>=0.122.0",
|
| 11 |
"mlflow>=3.6.0",
|
| 12 |
"numpy>=2.3.5",
|
|
|
|
| 13 |
"pandas>=2.3.3",
|
| 14 |
"pyaml>=25.7.0",
|
| 15 |
"pymongo[srv]>=4.15.4",
|
|
|
|
| 10 |
"fastapi>=0.122.0",
|
| 11 |
"mlflow>=3.6.0",
|
| 12 |
"numpy>=2.3.5",
|
| 13 |
+
"optuna>=4.6.0",
|
| 14 |
"pandas>=2.3.3",
|
| 15 |
"pyaml>=25.7.0",
|
| 16 |
"pymongo[srv]>=4.15.4",
|
models/weather-prediction/src/components/data_ingestion.py
CHANGED
|
@@ -36,6 +36,7 @@ class DataIngestion:
|
|
| 36 |
def ingest_all(self) -> str:
|
| 37 |
"""
|
| 38 |
Ingest historical weather data for all stations.
|
|
|
|
| 39 |
|
| 40 |
Returns:
|
| 41 |
Path to saved CSV file
|
|
@@ -55,9 +56,73 @@ class DataIngestion:
|
|
| 55 |
save_path=save_path
|
| 56 |
)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
|
| 59 |
return save_path
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
|
| 62 |
"""
|
| 63 |
Ingest data for a single station.
|
|
|
|
| 36 |
def ingest_all(self) -> str:
|
| 37 |
"""
|
| 38 |
Ingest historical weather data for all stations.
|
| 39 |
+
Falls back to synthetic data if scraping fails.
|
| 40 |
|
| 41 |
Returns:
|
| 42 |
Path to saved CSV file
|
|
|
|
| 56 |
save_path=save_path
|
| 57 |
)
|
| 58 |
|
| 59 |
+
# Fallback to synthetic data if scraping failed
|
| 60 |
+
if df.empty or len(df) < 100:
|
| 61 |
+
logger.warning("[DATA_INGESTION] Scraping failed or insufficient data. Generating synthetic training data.")
|
| 62 |
+
df = self._generate_synthetic_data()
|
| 63 |
+
df.to_csv(save_path, index=False)
|
| 64 |
+
logger.info(f"[DATA_INGESTION] Generated {len(df)} synthetic records")
|
| 65 |
+
|
| 66 |
logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
|
| 67 |
return save_path
|
| 68 |
|
| 69 |
+
def _generate_synthetic_data(self) -> pd.DataFrame:
|
| 70 |
+
"""
|
| 71 |
+
Generate synthetic weather data for training when scraping fails.
|
| 72 |
+
Uses realistic Sri Lankan climate patterns.
|
| 73 |
+
"""
|
| 74 |
+
import numpy as np
|
| 75 |
+
|
| 76 |
+
# Generate 1 year of daily data for priority stations
|
| 77 |
+
priority_stations = ["COLOMBO", "KANDY", "JAFFNA", "BATTICALOA", "RATNAPURA"]
|
| 78 |
+
|
| 79 |
+
records = []
|
| 80 |
+
for station in priority_stations:
|
| 81 |
+
if station not in self.config.stations:
|
| 82 |
+
continue
|
| 83 |
+
|
| 84 |
+
config = self.config.stations[station]
|
| 85 |
+
|
| 86 |
+
# Generate 365 days of data
|
| 87 |
+
for day_offset in range(365):
|
| 88 |
+
date = datetime.now() - pd.Timedelta(days=day_offset)
|
| 89 |
+
month = date.month
|
| 90 |
+
|
| 91 |
+
# Monsoon-aware temperature (more realistic for Sri Lanka)
|
| 92 |
+
# South-West monsoon: May-Sep, North-East: Dec-Feb
|
| 93 |
+
base_temp = 28 if month in [3, 4, 5, 6, 7, 8] else 26
|
| 94 |
+
temp_variation = np.random.normal(0, 2)
|
| 95 |
+
temp_mean = base_temp + temp_variation
|
| 96 |
+
|
| 97 |
+
# Monsoon rainfall patterns
|
| 98 |
+
if month in [10, 11, 12]: # NE monsoon - heavy rain
|
| 99 |
+
rainfall = max(0, np.random.exponential(15))
|
| 100 |
+
elif month in [5, 6, 7]: # SW monsoon - moderate rain
|
| 101 |
+
rainfall = max(0, np.random.exponential(10))
|
| 102 |
+
else: # Inter-monsoon / dry
|
| 103 |
+
rainfall = max(0, np.random.exponential(3))
|
| 104 |
+
|
| 105 |
+
records.append({
|
| 106 |
+
"date": date.strftime("%Y-%m-%d"),
|
| 107 |
+
"year": date.year,
|
| 108 |
+
"month": month,
|
| 109 |
+
"day": date.day,
|
| 110 |
+
"station_code": config["code"],
|
| 111 |
+
"station_name": station,
|
| 112 |
+
"temp_mean": round(temp_mean, 1),
|
| 113 |
+
"temp_max": round(temp_mean + np.random.uniform(3, 6), 1),
|
| 114 |
+
"temp_min": round(temp_mean - np.random.uniform(3, 5), 1),
|
| 115 |
+
"rainfall": round(rainfall, 1),
|
| 116 |
+
"humidity": round(np.random.uniform(65, 90), 1),
|
| 117 |
+
"wind_speed": round(np.random.uniform(5, 25), 1),
|
| 118 |
+
"pressure": round(np.random.uniform(1008, 1015), 1),
|
| 119 |
+
})
|
| 120 |
+
|
| 121 |
+
df = pd.DataFrame(records)
|
| 122 |
+
df["date"] = pd.to_datetime(df["date"])
|
| 123 |
+
df = df.sort_values(["station_name", "date"]).reset_index(drop=True)
|
| 124 |
+
return df
|
| 125 |
+
|
| 126 |
def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
|
| 127 |
"""
|
| 128 |
Ingest data for a single station.
|
models/weather-prediction/src/pipeline/train.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Weather Prediction Training Script
|
| 3 |
+
Convenience wrapper for: python models/weather-prediction/main.py --mode train
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python models/weather-prediction/src/pipeline/train.py [--station COLOMBO] [--epochs 100]
|
| 7 |
+
"""
|
| 8 |
+
import sys
|
| 9 |
+
import argparse
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# CRITICAL: Import standard library logging BEFORE adding src/ to path
|
| 13 |
+
# (src/logging/ directory would otherwise shadow the standard module)
|
| 14 |
+
import logging
|
| 15 |
+
|
| 16 |
+
# Add parent directories to path
|
| 17 |
+
PIPELINE_ROOT = Path(__file__).parent.parent.parent
|
| 18 |
+
sys.path.insert(0, str(PIPELINE_ROOT))
|
| 19 |
+
sys.path.insert(0, str(PIPELINE_ROOT / "src"))
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
|
| 22 |
+
parser = argparse.ArgumentParser(description="Weather Prediction Training")
|
| 23 |
+
parser.add_argument("--station", type=str, default=None, help="Station to train (e.g., COLOMBO)")
|
| 24 |
+
parser.add_argument("--epochs", type=int, default=100, help="Training epochs")
|
| 25 |
+
parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
|
| 26 |
+
|
| 27 |
+
args = parser.parse_args()
|
| 28 |
+
|
| 29 |
+
# Import from main.py (after path setup)
|
| 30 |
+
from main import run_training, run_full_pipeline, run_data_ingestion
|
| 31 |
+
|
| 32 |
+
print("=" * 60)
|
| 33 |
+
print("WEATHER PREDICTION - TRAINING PIPELINE")
|
| 34 |
+
print("=" * 60)
|
| 35 |
+
|
| 36 |
+
if args.full:
|
| 37 |
+
run_full_pipeline()
|
| 38 |
+
else:
|
| 39 |
+
# Run data ingestion first if no data exists
|
| 40 |
+
try:
|
| 41 |
+
from components.data_ingestion import DataIngestion
|
| 42 |
+
ingestion = DataIngestion()
|
| 43 |
+
df = ingestion.load_existing()
|
| 44 |
+
print(f"✓ Found existing data: {len(df)} records")
|
| 45 |
+
except FileNotFoundError:
|
| 46 |
+
print("No existing data, running ingestion first...")
|
| 47 |
+
run_data_ingestion(months=3)
|
| 48 |
+
|
| 49 |
+
# Run training
|
| 50 |
+
run_training(station=args.station, epochs=args.epochs)
|
| 51 |
+
|
| 52 |
+
print("=" * 60)
|
| 53 |
+
print("TRAINING COMPLETE!")
|
| 54 |
+
print("=" * 60)
|
| 55 |
+
|
models/weather-prediction/src/utils/tutiempo_scraper.py
CHANGED
|
@@ -175,19 +175,32 @@ class TutiempoScraper:
|
|
| 175 |
DataFrame with all historical records
|
| 176 |
"""
|
| 177 |
all_records = []
|
|
|
|
|
|
|
|
|
|
| 178 |
current = datetime.now()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
for i in range(months):
|
| 181 |
-
target_date =
|
| 182 |
year = target_date.year
|
| 183 |
month = target_date.month
|
| 184 |
|
| 185 |
records = self.scrape_month(station_code, year, month)
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
# Be nice to the server
|
| 193 |
time.sleep(1)
|
|
|
|
| 175 |
DataFrame with all historical records
|
| 176 |
"""
|
| 177 |
all_records = []
|
| 178 |
+
|
| 179 |
+
# IMPORTANT: TuTiempo has data publication delay of ~2-3 months
|
| 180 |
+
# Start from 3 months ago to avoid 404 errors on recent months
|
| 181 |
current = datetime.now()
|
| 182 |
+
start_date = current - timedelta(days=90) # Start 3 months ago
|
| 183 |
+
|
| 184 |
+
consecutive_failures = 0
|
| 185 |
+
max_consecutive_failures = 3
|
| 186 |
|
| 187 |
for i in range(months):
|
| 188 |
+
target_date = start_date - timedelta(days=30 * i)
|
| 189 |
year = target_date.year
|
| 190 |
month = target_date.month
|
| 191 |
|
| 192 |
records = self.scrape_month(station_code, year, month)
|
| 193 |
|
| 194 |
+
if not records:
|
| 195 |
+
consecutive_failures += 1
|
| 196 |
+
if consecutive_failures >= max_consecutive_failures:
|
| 197 |
+
logger.warning(f"[TUTIEMPO] {max_consecutive_failures} consecutive failures for {station_name}, stopping")
|
| 198 |
+
break
|
| 199 |
+
else:
|
| 200 |
+
consecutive_failures = 0 # Reset on success
|
| 201 |
+
for r in records:
|
| 202 |
+
r["station_name"] = station_name
|
| 203 |
+
all_records.extend(records)
|
| 204 |
|
| 205 |
# Be nice to the server
|
| 206 |
time.sleep(1)
|
models/weather-prediction/uv.lock
CHANGED
|
@@ -298,6 +298,18 @@ wheels = [
|
|
| 298 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 299 |
]
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
[[package]]
|
| 302 |
name = "contourpy"
|
| 303 |
version = "1.3.3"
|
|
@@ -1375,6 +1387,7 @@ dependencies = [
|
|
| 1375 |
{ name = "fastapi" },
|
| 1376 |
{ name = "mlflow" },
|
| 1377 |
{ name = "numpy" },
|
|
|
|
| 1378 |
{ name = "pandas" },
|
| 1379 |
{ name = "pyaml" },
|
| 1380 |
{ name = "pymongo" },
|
|
@@ -1391,6 +1404,7 @@ requires-dist = [
|
|
| 1391 |
{ name = "fastapi", specifier = ">=0.122.0" },
|
| 1392 |
{ name = "mlflow", specifier = ">=3.6.0" },
|
| 1393 |
{ name = "numpy", specifier = ">=2.3.5" },
|
|
|
|
| 1394 |
{ name = "pandas", specifier = ">=2.3.3" },
|
| 1395 |
{ name = "pyaml", specifier = ">=25.7.0" },
|
| 1396 |
{ name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
|
|
@@ -1659,6 +1673,24 @@ wheels = [
|
|
| 1659 |
{ url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
|
| 1660 |
]
|
| 1661 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1662 |
[[package]]
|
| 1663 |
name = "packaging"
|
| 1664 |
version = "25.0"
|
|
@@ -2615,6 +2647,18 @@ wheels = [
|
|
| 2615 |
{ url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
|
| 2616 |
]
|
| 2617 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2618 |
[[package]]
|
| 2619 |
name = "treelib"
|
| 2620 |
version = "1.8.0"
|
|
|
|
| 298 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 299 |
]
|
| 300 |
|
| 301 |
+
[[package]]
|
| 302 |
+
name = "colorlog"
|
| 303 |
+
version = "6.10.1"
|
| 304 |
+
source = { registry = "https://pypi.org/simple" }
|
| 305 |
+
dependencies = [
|
| 306 |
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 307 |
+
]
|
| 308 |
+
sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" }
|
| 309 |
+
wheels = [
|
| 310 |
+
{ url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" },
|
| 311 |
+
]
|
| 312 |
+
|
| 313 |
[[package]]
|
| 314 |
name = "contourpy"
|
| 315 |
version = "1.3.3"
|
|
|
|
| 1387 |
{ name = "fastapi" },
|
| 1388 |
{ name = "mlflow" },
|
| 1389 |
{ name = "numpy" },
|
| 1390 |
+
{ name = "optuna" },
|
| 1391 |
{ name = "pandas" },
|
| 1392 |
{ name = "pyaml" },
|
| 1393 |
{ name = "pymongo" },
|
|
|
|
| 1404 |
{ name = "fastapi", specifier = ">=0.122.0" },
|
| 1405 |
{ name = "mlflow", specifier = ">=3.6.0" },
|
| 1406 |
{ name = "numpy", specifier = ">=2.3.5" },
|
| 1407 |
+
{ name = "optuna", specifier = ">=4.6.0" },
|
| 1408 |
{ name = "pandas", specifier = ">=2.3.3" },
|
| 1409 |
{ name = "pyaml", specifier = ">=25.7.0" },
|
| 1410 |
{ name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
|
|
|
|
| 1673 |
{ url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
|
| 1674 |
]
|
| 1675 |
|
| 1676 |
+
[[package]]
|
| 1677 |
+
name = "optuna"
|
| 1678 |
+
version = "4.6.0"
|
| 1679 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1680 |
+
dependencies = [
|
| 1681 |
+
{ name = "alembic" },
|
| 1682 |
+
{ name = "colorlog" },
|
| 1683 |
+
{ name = "numpy" },
|
| 1684 |
+
{ name = "packaging" },
|
| 1685 |
+
{ name = "pyyaml" },
|
| 1686 |
+
{ name = "sqlalchemy" },
|
| 1687 |
+
{ name = "tqdm" },
|
| 1688 |
+
]
|
| 1689 |
+
sdist = { url = "https://files.pythonhosted.org/packages/6b/81/08f90f194eed78178064a9383432eca95611e2c5331e7b01e2418ce4b15a/optuna-4.6.0.tar.gz", hash = "sha256:89e38c2447c7f793a726617b8043f01e31f0bad54855040db17eb3b49404a369", size = 477444, upload-time = "2025-11-10T05:14:30.151Z" }
|
| 1690 |
+
wheels = [
|
| 1691 |
+
{ url = "https://files.pythonhosted.org/packages/58/de/3d8455b08cb6312f8cc46aacdf16c71d4d881a1db4a4140fc5ef31108422/optuna-4.6.0-py3-none-any.whl", hash = "sha256:4c3a9facdef2b2dd7e3e2a8ae3697effa70fae4056fcf3425cfc6f5a40feb069", size = 404708, upload-time = "2025-11-10T05:14:28.6Z" },
|
| 1692 |
+
]
|
| 1693 |
+
|
| 1694 |
[[package]]
|
| 1695 |
name = "packaging"
|
| 1696 |
version = "25.0"
|
|
|
|
| 2647 |
{ url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
|
| 2648 |
]
|
| 2649 |
|
| 2650 |
+
[[package]]
|
| 2651 |
+
name = "tqdm"
|
| 2652 |
+
version = "4.67.1"
|
| 2653 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2654 |
+
dependencies = [
|
| 2655 |
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 2656 |
+
]
|
| 2657 |
+
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
|
| 2658 |
+
wheels = [
|
| 2659 |
+
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
| 2660 |
+
]
|
| 2661 |
+
|
| 2662 |
[[package]]
|
| 2663 |
name = "treelib"
|
| 2664 |
version = "1.8.0"
|
requirements.txt
CHANGED
|
@@ -111,3 +111,5 @@ pyyaml
|
|
| 111 |
# Utilities
|
| 112 |
joblib
|
| 113 |
tqdm
|
|
|
|
|
|
|
|
|
| 111 |
# Utilities
|
| 112 |
joblib
|
| 113 |
tqdm
|
| 114 |
+
|
| 115 |
+
optuna
|
src/config/intel_config.json
CHANGED
|
@@ -1,72 +1,85 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
"
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
"
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
"
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"user_profiles": {
|
| 3 |
+
"twitter": [
|
| 4 |
+
"nivakaran"
|
| 5 |
+
],
|
| 6 |
+
"facebook": [
|
| 7 |
+
"Nivakaran"
|
| 8 |
+
],
|
| 9 |
+
"linkedin": [
|
| 10 |
+
"nivakaran"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
"user_keywords": [
|
| 14 |
+
"Colombo",
|
| 15 |
+
"nivakaran",
|
| 16 |
+
"telco"
|
| 17 |
+
],
|
| 18 |
+
"user_products": [
|
| 19 |
+
"iphone",
|
| 20 |
+
"anchor"
|
| 21 |
+
],
|
| 22 |
+
"operational_keywords": {
|
| 23 |
+
"infrastructure": [
|
| 24 |
+
"Colombo port",
|
| 25 |
+
"Hambantota port",
|
| 26 |
+
"port strike",
|
| 27 |
+
"power outage",
|
| 28 |
+
"water shortage",
|
| 29 |
+
"fuel shortage",
|
| 30 |
+
"airport delay",
|
| 31 |
+
"customs clearance",
|
| 32 |
+
"road closure",
|
| 33 |
+
"railway disruption"
|
| 34 |
+
],
|
| 35 |
+
"government": [
|
| 36 |
+
"cabinet decision",
|
| 37 |
+
"new policy",
|
| 38 |
+
"regulation change",
|
| 39 |
+
"tax amendment",
|
| 40 |
+
"import restriction",
|
| 41 |
+
"export ban",
|
| 42 |
+
"license requirement",
|
| 43 |
+
"central bank",
|
| 44 |
+
"budget announcement"
|
| 45 |
+
],
|
| 46 |
+
"opportunity": [
|
| 47 |
+
"investment",
|
| 48 |
+
"expansion",
|
| 49 |
+
"new factory",
|
| 50 |
+
"job creation",
|
| 51 |
+
"export growth",
|
| 52 |
+
"tourism boost",
|
| 53 |
+
"infrastructure project",
|
| 54 |
+
"development grant",
|
| 55 |
+
"FDI",
|
| 56 |
+
"trade agreement"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
"alert_thresholds": {
|
| 60 |
+
"trending_momentum_min": 2.0,
|
| 61 |
+
"spike_multiplier": 3.0,
|
| 62 |
+
"high_risk_score": 0.7,
|
| 63 |
+
"high_opportunity_score": 0.6
|
| 64 |
+
},
|
| 65 |
+
"default_competitors": {
|
| 66 |
+
"telecom": {
|
| 67 |
+
"twitter": [
|
| 68 |
+
"DialogSriLanka",
|
| 69 |
+
"Mobaborang",
|
| 70 |
+
"HutchSL"
|
| 71 |
+
],
|
| 72 |
+
"facebook": [
|
| 73 |
+
"Dialog",
|
| 74 |
+
"SLT-Mobitel",
|
| 75 |
+
"Hutch"
|
| 76 |
+
]
|
| 77 |
}
|
| 78 |
+
},
|
| 79 |
+
"notes": {
|
| 80 |
+
"removed_profiles": [
|
| 81 |
+
"SLTMobitel - Twitter profile not found/restricted"
|
| 82 |
+
],
|
| 83 |
+
"last_verified": "2025-12-08"
|
| 84 |
+
}
|
| 85 |
}
|
src/nodes/intelligenceAgentNode.py
CHANGED
|
@@ -396,16 +396,19 @@ class IntelligenceAgentNode:
|
|
| 396 |
|
| 397 |
def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
|
| 398 |
"""
|
| 399 |
-
Generate competitive intelligence summary using LLM
|
| 400 |
"""
|
| 401 |
-
print("[MODULE 3B] Generating LLM Summary")
|
| 402 |
|
| 403 |
all_results = state.get("worker_results", [])
|
| 404 |
profile_feeds = state.get("profile_feeds", {})
|
| 405 |
competitor_feeds = state.get("competitor_feeds", {})
|
| 406 |
product_feeds = state.get("product_review_feeds", {})
|
| 407 |
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
| 409 |
summary_data = {
|
| 410 |
"total_results": len(all_results),
|
| 411 |
"profiles_monitored": list(profile_feeds.keys()),
|
|
@@ -415,42 +418,90 @@ class IntelligenceAgentNode:
|
|
| 415 |
"global_competitors": len(state.get("global_intel", []))
|
| 416 |
}
|
| 417 |
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
- Local market intelligence: {summary_data['local_competitors']} items
|
| 427 |
-
- Global market intelligence: {summary_data['global_competitors']} items
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
|
|
|
| 434 |
|
| 435 |
-
|
| 436 |
-
"""
|
| 437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
try:
|
| 439 |
response = self.llm.invoke(prompt)
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
except Exception as e:
|
| 443 |
-
llm_summary = f"LLM Summary unavailable: {e}"
|
| 444 |
print(f" ⚠️ LLM error: {e}")
|
| 445 |
|
| 446 |
return {
|
| 447 |
"llm_summary": llm_summary,
|
|
|
|
| 448 |
"structured_output": summary_data
|
| 449 |
}
|
| 450 |
|
| 451 |
def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
|
| 452 |
"""
|
| 453 |
-
Module 3C: Format final competitive intelligence feed
|
| 454 |
"""
|
| 455 |
print("[MODULE 3C] Formatting Final Output")
|
| 456 |
|
|
@@ -458,6 +509,7 @@ class IntelligenceAgentNode:
|
|
| 458 |
competitor_feeds = state.get("competitor_feeds", {})
|
| 459 |
product_feeds = state.get("product_review_feeds", {})
|
| 460 |
llm_summary = state.get("llm_summary", "No summary available")
|
|
|
|
| 461 |
local_intel = state.get("local_intel", [])
|
| 462 |
global_intel = state.get("global_intel", [])
|
| 463 |
|
|
@@ -491,84 +543,66 @@ Source: Multi-platform competitive intelligence (Twitter, Facebook, LinkedIn, In
|
|
| 491 |
"""
|
| 492 |
|
| 493 |
# Create integration output with structured data
|
| 494 |
-
# FIXED: Pass actual feed data, not just counts
|
| 495 |
structured_feeds = {
|
| 496 |
-
"profiles": profile_feeds,
|
| 497 |
-
"competitors": competitor_feeds,
|
| 498 |
-
"products": product_feeds,
|
| 499 |
"local_intel": local_intel,
|
| 500 |
"global_intel": global_intel
|
| 501 |
}
|
| 502 |
|
| 503 |
-
# Create list for
|
| 504 |
domain_insights = []
|
| 505 |
timestamp = datetime.utcnow().isoformat()
|
| 506 |
|
| 507 |
-
# 1
|
| 508 |
-
for
|
| 509 |
-
if
|
| 510 |
-
continue
|
| 511 |
-
for post in posts[:5]:
|
| 512 |
-
post_text = post.get("text", "") or post.get("title", "")
|
| 513 |
-
if not post_text or len(post_text) < 10:
|
| 514 |
-
continue
|
| 515 |
domain_insights.append({
|
| 516 |
"source_event_id": str(uuid.uuid4()),
|
| 517 |
"domain": "intelligence",
|
| 518 |
-
"summary": f"
|
| 519 |
-
"severity": "medium",
|
| 520 |
-
"impact_type": "risk",
|
| 521 |
-
"timestamp": timestamp
|
|
|
|
| 522 |
})
|
| 523 |
|
| 524 |
-
|
| 525 |
-
for competitor, posts in competitor_feeds.items():
|
| 526 |
-
if not isinstance(posts, list):
|
| 527 |
-
continue
|
| 528 |
-
for post in posts[:5]:
|
| 529 |
-
post_text = post.get("text", "") or post.get("title", "")
|
| 530 |
-
if not post_text or len(post_text) < 10:
|
| 531 |
-
continue
|
| 532 |
-
severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
|
| 533 |
-
domain_insights.append({
|
| 534 |
-
"source_event_id": str(uuid.uuid4()),
|
| 535 |
-
"domain": "intelligence",
|
| 536 |
-
"summary": f"Competitor ({competitor}): {post_text[:200]}",
|
| 537 |
-
"severity": severity,
|
| 538 |
-
"impact_type": "risk",
|
| 539 |
-
"timestamp": timestamp
|
| 540 |
-
})
|
| 541 |
|
| 542 |
-
#
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
review_text = review.get("text", "") or review.get("title", "")
|
| 548 |
-
if not review_text or len(review_text) < 10:
|
| 549 |
continue
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
"
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
domain_insights.append({
|
| 563 |
"source_event_id": str(uuid.uuid4()),
|
| 564 |
"structured_data": structured_feeds,
|
| 565 |
"domain": "intelligence",
|
| 566 |
-
"summary": f"Business Intelligence Summary: {llm_summary[:300]}",
|
| 567 |
"severity": "medium",
|
| 568 |
-
"impact_type": "risk"
|
|
|
|
| 569 |
})
|
| 570 |
|
| 571 |
-
print(f" ✓ Created {len(domain_insights)} intelligence insights")
|
| 572 |
|
| 573 |
return {
|
| 574 |
"final_feed": bulletin,
|
|
|
|
| 396 |
|
| 397 |
def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
|
| 398 |
"""
|
| 399 |
+
Generate competitive intelligence summary AND structured insights using LLM
|
| 400 |
"""
|
| 401 |
+
print("[MODULE 3B] Generating LLM Summary + Competitive Insights")
|
| 402 |
|
| 403 |
all_results = state.get("worker_results", [])
|
| 404 |
profile_feeds = state.get("profile_feeds", {})
|
| 405 |
competitor_feeds = state.get("competitor_feeds", {})
|
| 406 |
product_feeds = state.get("product_review_feeds", {})
|
| 407 |
|
| 408 |
+
llm_summary = "Competitive intelligence summary unavailable."
|
| 409 |
+
llm_insights = []
|
| 410 |
+
|
| 411 |
+
# Prepare summary data
|
| 412 |
summary_data = {
|
| 413 |
"total_results": len(all_results),
|
| 414 |
"profiles_monitored": list(profile_feeds.keys()),
|
|
|
|
| 418 |
"global_competitors": len(state.get("global_intel", []))
|
| 419 |
}
|
| 420 |
|
| 421 |
+
# Collect sample data for LLM analysis
|
| 422 |
+
sample_posts = []
|
| 423 |
+
for profile, posts in profile_feeds.items():
|
| 424 |
+
if isinstance(posts, list):
|
| 425 |
+
for p in posts[:2]:
|
| 426 |
+
text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
|
| 427 |
+
if text:
|
| 428 |
+
sample_posts.append(f"[PROFILE: {profile}] {text[:150]}")
|
|
|
|
|
|
|
| 429 |
|
| 430 |
+
for competitor, posts in competitor_feeds.items():
|
| 431 |
+
if isinstance(posts, list):
|
| 432 |
+
for p in posts[:2]:
|
| 433 |
+
text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
|
| 434 |
+
if text:
|
| 435 |
+
sample_posts.append(f"[COMPETITOR: {competitor}] {text[:150]}")
|
| 436 |
|
| 437 |
+
posts_text = "\n".join(sample_posts[:10]) if sample_posts else "No detailed data available"
|
|
|
|
| 438 |
|
| 439 |
+
prompt = f"""Analyze this competitive intelligence data and generate:
|
| 440 |
+
1. A strategic 3-sentence executive summary
|
| 441 |
+
2. Up to 5 unique business intelligence insights
|
| 442 |
+
|
| 443 |
+
Data Overview:
|
| 444 |
+
- Total intelligence: {summary_data['total_results']} items
|
| 445 |
+
- Competitors tracked: {', '.join(summary_data['competitors_tracked']) or 'None'}
|
| 446 |
+
- Products analyzed: {', '.join(summary_data['products_analyzed']) or 'None'}
|
| 447 |
+
|
| 448 |
+
Sample Data:
|
| 449 |
+
{posts_text}
|
| 450 |
+
|
| 451 |
+
Respond in this exact JSON format:
|
| 452 |
+
{{
|
| 453 |
+
"executive_summary": "Strategic 3-sentence summary of competitive landscape",
|
| 454 |
+
"insights": [
|
| 455 |
+
{{"summary": "Unique competitive insight #1", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
|
| 456 |
+
{{"summary": "Unique competitive insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
|
| 457 |
+
]
|
| 458 |
+
}}
|
| 459 |
+
|
| 460 |
+
Rules:
|
| 461 |
+
- Generate actionable business intelligence, not just data descriptions
|
| 462 |
+
- Identify competitive threats as "risk", business opportunities as "opportunity"
|
| 463 |
+
- Severity: high=urgent action needed, medium=monitor closely, low=informational
|
| 464 |
+
|
| 465 |
+
JSON only:"""
|
| 466 |
+
|
| 467 |
try:
|
| 468 |
response = self.llm.invoke(prompt)
|
| 469 |
+
content = response.content if hasattr(response, 'content') else str(response)
|
| 470 |
+
|
| 471 |
+
# Parse JSON response
|
| 472 |
+
import re
|
| 473 |
+
content = content.strip()
|
| 474 |
+
if content.startswith("```"):
|
| 475 |
+
content = re.sub(r'^```\w*\n?', '', content)
|
| 476 |
+
content = re.sub(r'\n?```$', '', content)
|
| 477 |
+
|
| 478 |
+
result = json.loads(content)
|
| 479 |
+
llm_summary = result.get("executive_summary", llm_summary)
|
| 480 |
+
llm_insights = result.get("insights", [])
|
| 481 |
+
|
| 482 |
+
print(f" ✓ LLM generated {len(llm_insights)} competitive insights")
|
| 483 |
+
|
| 484 |
+
except json.JSONDecodeError as e:
|
| 485 |
+
print(f" ⚠️ JSON parse error: {e}")
|
| 486 |
+
# Fallback to simple summary
|
| 487 |
+
try:
|
| 488 |
+
fallback_prompt = f"Summarize this competitive intelligence in 3 sentences:\n{posts_text[:1500]}"
|
| 489 |
+
response = self.llm.invoke(fallback_prompt)
|
| 490 |
+
llm_summary = response.content if hasattr(response, 'content') else str(response)
|
| 491 |
+
except:
|
| 492 |
+
pass
|
| 493 |
except Exception as e:
|
|
|
|
| 494 |
print(f" ⚠️ LLM error: {e}")
|
| 495 |
|
| 496 |
return {
|
| 497 |
"llm_summary": llm_summary,
|
| 498 |
+
"llm_insights": llm_insights,
|
| 499 |
"structured_output": summary_data
|
| 500 |
}
|
| 501 |
|
| 502 |
def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
|
| 503 |
"""
|
| 504 |
+
Module 3C: Format final competitive intelligence feed with LLM-enhanced insights
|
| 505 |
"""
|
| 506 |
print("[MODULE 3C] Formatting Final Output")
|
| 507 |
|
|
|
|
| 509 |
competitor_feeds = state.get("competitor_feeds", {})
|
| 510 |
product_feeds = state.get("product_review_feeds", {})
|
| 511 |
llm_summary = state.get("llm_summary", "No summary available")
|
| 512 |
+
llm_insights = state.get("llm_insights", []) # NEW: Get LLM-generated insights
|
| 513 |
local_intel = state.get("local_intel", [])
|
| 514 |
global_intel = state.get("global_intel", [])
|
| 515 |
|
|
|
|
| 543 |
"""
|
| 544 |
|
| 545 |
# Create integration output with structured data
|
|
|
|
| 546 |
structured_feeds = {
|
| 547 |
+
"profiles": profile_feeds,
|
| 548 |
+
"competitors": competitor_feeds,
|
| 549 |
+
"products": product_feeds,
|
| 550 |
"local_intel": local_intel,
|
| 551 |
"global_intel": global_intel
|
| 552 |
}
|
| 553 |
|
| 554 |
+
# Create list for domain_insights (FRONTEND COMPATIBLE)
|
| 555 |
domain_insights = []
|
| 556 |
timestamp = datetime.utcnow().isoformat()
|
| 557 |
|
| 558 |
+
# PRIORITY 1: Add LLM-generated unique insights (curated and actionable)
|
| 559 |
+
for insight in llm_insights:
|
| 560 |
+
if isinstance(insight, dict) and insight.get("summary"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
domain_insights.append({
|
| 562 |
"source_event_id": str(uuid.uuid4()),
|
| 563 |
"domain": "intelligence",
|
| 564 |
+
"summary": f"🎯 {insight.get('summary', '')}", # Mark as AI-analyzed
|
| 565 |
+
"severity": insight.get("severity", "medium"),
|
| 566 |
+
"impact_type": insight.get("impact_type", "risk"),
|
| 567 |
+
"timestamp": timestamp,
|
| 568 |
+
"is_llm_generated": True
|
| 569 |
})
|
| 570 |
|
| 571 |
+
print(f" ✓ Added {len(llm_insights)} LLM-generated competitive insights")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
|
| 573 |
+
# PRIORITY 2: Add raw data only as fallback if LLM didn't generate enough
|
| 574 |
+
if len(domain_insights) < 5:
|
| 575 |
+
# Add competitor insights as fallback
|
| 576 |
+
for competitor, posts in competitor_feeds.items():
|
| 577 |
+
if not isinstance(posts, list):
|
|
|
|
|
|
|
| 578 |
continue
|
| 579 |
+
for post in posts[:3]:
|
| 580 |
+
post_text = post.get("text", "") or post.get("title", "")
|
| 581 |
+
if not post_text or len(post_text) < 20:
|
| 582 |
+
continue
|
| 583 |
+
severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
|
| 584 |
+
domain_insights.append({
|
| 585 |
+
"source_event_id": str(uuid.uuid4()),
|
| 586 |
+
"domain": "intelligence",
|
| 587 |
+
"summary": f"Competitor ({competitor}): {post_text[:200]}",
|
| 588 |
+
"severity": severity,
|
| 589 |
+
"impact_type": "risk",
|
| 590 |
+
"timestamp": timestamp,
|
| 591 |
+
"is_llm_generated": False
|
| 592 |
+
})
|
| 593 |
+
|
| 594 |
+
# Add executive summary insight
|
| 595 |
domain_insights.append({
|
| 596 |
"source_event_id": str(uuid.uuid4()),
|
| 597 |
"structured_data": structured_feeds,
|
| 598 |
"domain": "intelligence",
|
| 599 |
+
"summary": f"📊 Business Intelligence Summary: {llm_summary[:300]}",
|
| 600 |
"severity": "medium",
|
| 601 |
+
"impact_type": "risk",
|
| 602 |
+
"is_llm_generated": True
|
| 603 |
})
|
| 604 |
|
| 605 |
+
print(f" ✓ Created {len(domain_insights)} total intelligence insights")
|
| 606 |
|
| 607 |
return {
|
| 608 |
"final_feed": bulletin,
|
src/nodes/socialAgentNode.py
CHANGED
|
@@ -404,45 +404,94 @@ class SocialAgentNode:
|
|
| 404 |
|
| 405 |
def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
|
| 406 |
"""
|
| 407 |
-
Module 3B: Use Groq LLM to generate executive summary
|
| 408 |
"""
|
| 409 |
-
print("[MODULE 3B] Generating LLM Summary")
|
| 410 |
|
| 411 |
structured_feeds = state.get("structured_output", {})
|
|
|
|
|
|
|
| 412 |
|
| 413 |
try:
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
- Asia Trending: {len(structured_feeds.get('asia', []))} items
|
| 419 |
-
- World Trending: {len(structured_feeds.get('world', []))} items
|
| 420 |
|
| 421 |
-
|
| 422 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
|
| 426 |
-
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
-
|
|
|
|
|
|
|
| 430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
except Exception as e:
|
| 432 |
print(f" ⚠️ LLM Error: {e}")
|
| 433 |
-
llm_summary = "AI summary currently unavailable."
|
| 434 |
|
| 435 |
return {
|
| 436 |
-
"llm_summary": llm_summary
|
|
|
|
| 437 |
}
|
| 438 |
|
| 439 |
def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
|
| 440 |
"""
|
| 441 |
-
Module 3C: Format final feed output
|
| 442 |
"""
|
| 443 |
print("[MODULE 3C] Formatting Final Output")
|
| 444 |
|
| 445 |
llm_summary = state.get("llm_summary", "No summary available")
|
|
|
|
| 446 |
structured_feeds = state.get("structured_output", {})
|
| 447 |
|
| 448 |
trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
|
|
@@ -483,91 +532,81 @@ Monitoring social sentiment, trending topics, events, and people across:
|
|
| 483 |
Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
|
| 484 |
"""
|
| 485 |
|
| 486 |
-
# Create list for
|
| 487 |
domain_insights = []
|
| 488 |
timestamp = datetime.utcnow().isoformat()
|
| 489 |
|
| 490 |
-
#
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
-
#
|
| 508 |
-
|
| 509 |
-
for
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
"
|
| 523 |
-
"
|
| 524 |
-
|
| 525 |
-
"
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
"summary": f"Asia Regional: {post_text[:200]}",
|
| 540 |
-
"severity": "medium",
|
| 541 |
-
"impact_type": "risk",
|
| 542 |
-
"timestamp": timestamp
|
| 543 |
-
})
|
| 544 |
-
|
| 545 |
-
# 3. Create World insights
|
| 546 |
-
world_data = structured_feeds.get("world", [])
|
| 547 |
-
for post in world_data[:5]:
|
| 548 |
-
post_text = post.get("text", "") or post.get("title", "")
|
| 549 |
-
if not post_text or len(post_text) < 10:
|
| 550 |
-
continue
|
| 551 |
-
domain_insights.append({
|
| 552 |
-
"source_event_id": str(uuid.uuid4()),
|
| 553 |
-
"domain": "social",
|
| 554 |
-
"summary": f"Global: {post_text[:200]}",
|
| 555 |
-
"severity": "low",
|
| 556 |
-
"impact_type": "opportunity",
|
| 557 |
-
"timestamp": timestamp
|
| 558 |
-
})
|
| 559 |
-
|
| 560 |
-
# 4. Add executive summary insight
|
| 561 |
domain_insights.append({
|
| 562 |
"source_event_id": str(uuid.uuid4()),
|
| 563 |
"structured_data": structured_feeds,
|
| 564 |
"domain": "social",
|
| 565 |
-
"summary": f"
|
| 566 |
"severity": "medium",
|
| 567 |
-
"impact_type": "risk"
|
|
|
|
| 568 |
})
|
| 569 |
|
| 570 |
-
print(f" ✓ Created {len(domain_insights)} social intelligence insights")
|
| 571 |
|
| 572 |
return {
|
| 573 |
"final_feed": bulletin,
|
|
|
|
| 404 |
|
| 405 |
def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
|
| 406 |
"""
|
| 407 |
+
Module 3B: Use Groq LLM to generate executive summary AND structured insights
|
| 408 |
"""
|
| 409 |
+
print("[MODULE 3B] Generating LLM Summary + Structured Insights")
|
| 410 |
|
| 411 |
structured_feeds = state.get("structured_output", {})
|
| 412 |
+
llm_summary = "AI summary currently unavailable."
|
| 413 |
+
llm_insights = []
|
| 414 |
|
| 415 |
try:
|
| 416 |
+
# Collect sample posts for analysis
|
| 417 |
+
all_posts = []
|
| 418 |
+
for region, posts in structured_feeds.items():
|
| 419 |
+
for p in posts[:5]: # Top 5 per region
|
| 420 |
+
text = p.get("text", "") or p.get("title", "")
|
| 421 |
+
if text and len(text) > 20:
|
| 422 |
+
all_posts.append(f"[{region.upper()}] {text[:200]}")
|
| 423 |
+
|
| 424 |
+
if not all_posts:
|
| 425 |
+
return {"llm_summary": llm_summary, "llm_insights": []}
|
| 426 |
+
|
| 427 |
+
posts_text = "\n".join(all_posts[:15])
|
| 428 |
+
|
| 429 |
+
# Generate summary AND structured insights
|
| 430 |
+
analysis_prompt = f"""Analyze these social media posts from Sri Lanka and the region. Generate:
|
| 431 |
+
1. A 3-sentence executive summary of key trends
|
| 432 |
+
2. Up to 5 unique intelligence insights
|
| 433 |
|
| 434 |
+
Posts:
|
| 435 |
+
{posts_text}
|
|
|
|
|
|
|
| 436 |
|
| 437 |
+
Respond in this exact JSON format:
|
| 438 |
+
{{
|
| 439 |
+
"executive_summary": "Brief 3-sentence summary of key social trends and developments",
|
| 440 |
+
"insights": [
|
| 441 |
+
{{"summary": "Unique insight #1 (not copying post text)", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
|
| 442 |
+
{{"summary": "Unique insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
|
| 443 |
+
]
|
| 444 |
+
}}
|
| 445 |
|
| 446 |
+
Rules:
|
| 447 |
+
- Generate NEW insights, don't just copy post text
|
| 448 |
+
- Identify patterns and emerging trends
|
| 449 |
+
- Classify severity based on potential impact
|
| 450 |
+
- Mark positive developments as "opportunity", concerning ones as "risk"
|
| 451 |
|
| 452 |
+
JSON only, no explanation:"""
|
| 453 |
+
|
| 454 |
+
llm_response = self.llm.invoke(analysis_prompt)
|
| 455 |
+
content = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
|
| 456 |
+
|
| 457 |
+
# Parse JSON response
|
| 458 |
+
import re
|
| 459 |
+
content = content.strip()
|
| 460 |
+
if content.startswith("```"):
|
| 461 |
+
content = re.sub(r'^```\w*\n?', '', content)
|
| 462 |
+
content = re.sub(r'\n?```$', '', content)
|
| 463 |
|
| 464 |
+
result = json.loads(content)
|
| 465 |
+
llm_summary = result.get("executive_summary", llm_summary)
|
| 466 |
+
llm_insights = result.get("insights", [])
|
| 467 |
|
| 468 |
+
print(f" ✓ LLM generated {len(llm_insights)} unique insights")
|
| 469 |
+
|
| 470 |
+
except json.JSONDecodeError as e:
|
| 471 |
+
print(f" ⚠️ JSON parse error: {e}")
|
| 472 |
+
# Fallback to simple summary
|
| 473 |
+
try:
|
| 474 |
+
fallback_prompt = f"Summarize these social media trends in 3 sentences:\n{posts_text[:1500]}"
|
| 475 |
+
response = self.llm.invoke(fallback_prompt)
|
| 476 |
+
llm_summary = response.content if hasattr(response, 'content') else str(response)
|
| 477 |
+
except:
|
| 478 |
+
pass
|
| 479 |
except Exception as e:
|
| 480 |
print(f" ⚠️ LLM Error: {e}")
|
|
|
|
| 481 |
|
| 482 |
return {
|
| 483 |
+
"llm_summary": llm_summary,
|
| 484 |
+
"llm_insights": llm_insights
|
| 485 |
}
|
| 486 |
|
| 487 |
def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
|
| 488 |
"""
|
| 489 |
+
Module 3C: Format final feed output with LLM-enhanced insights
|
| 490 |
"""
|
| 491 |
print("[MODULE 3C] Formatting Final Output")
|
| 492 |
|
| 493 |
llm_summary = state.get("llm_summary", "No summary available")
|
| 494 |
+
llm_insights = state.get("llm_insights", []) # NEW: Get LLM-generated insights
|
| 495 |
structured_feeds = state.get("structured_output", {})
|
| 496 |
|
| 497 |
trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
|
|
|
|
| 532 |
Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
|
| 533 |
"""
|
| 534 |
|
| 535 |
+
# Create list for domain_insights (FRONTEND COMPATIBLE)
|
| 536 |
domain_insights = []
|
| 537 |
timestamp = datetime.utcnow().isoformat()
|
| 538 |
|
| 539 |
+
# PRIORITY 1: Add LLM-generated unique insights (these are curated and unique)
|
| 540 |
+
for insight in llm_insights:
|
| 541 |
+
if isinstance(insight, dict) and insight.get("summary"):
|
| 542 |
+
domain_insights.append({
|
| 543 |
+
"source_event_id": str(uuid.uuid4()),
|
| 544 |
+
"domain": "social",
|
| 545 |
+
"summary": f"🔍 {insight.get('summary', '')}", # Mark as AI-analyzed
|
| 546 |
+
"severity": insight.get("severity", "medium"),
|
| 547 |
+
"impact_type": insight.get("impact_type", "risk"),
|
| 548 |
+
"timestamp": timestamp,
|
| 549 |
+
"is_llm_generated": True # Flag for frontend
|
| 550 |
+
})
|
| 551 |
|
| 552 |
+
print(f" ✓ Added {len(llm_insights)} LLM-generated insights")
|
| 553 |
+
|
| 554 |
+
# PRIORITY 2: Add top raw posts only if we need more (fallback)
|
| 555 |
+
# Only add raw posts if LLM didn't generate enough insights
|
| 556 |
+
if len(domain_insights) < 5:
|
| 557 |
+
# Sri Lankan districts for geographic tagging
|
| 558 |
+
districts = [
|
| 559 |
+
"colombo", "gampaha", "kalutara", "kandy", "matale",
|
| 560 |
+
"nuwara eliya", "galle", "matara", "hambantota",
|
| 561 |
+
"jaffna", "kilinochchi", "mannar", "mullaitivu", "vavuniya",
|
| 562 |
+
"puttalam", "kurunegala", "anuradhapura", "polonnaruwa",
|
| 563 |
+
"badulla", "monaragala", "ratnapura", "kegalle",
|
| 564 |
+
"ampara", "batticaloa", "trincomalee"
|
| 565 |
+
]
|
| 566 |
|
| 567 |
+
# Add Sri Lanka posts as fallback
|
| 568 |
+
sri_lanka_data = structured_feeds.get("sri lanka", [])
|
| 569 |
+
for post in sri_lanka_data[:5]:
|
| 570 |
+
post_text = post.get("text", "") or post.get("title", "")
|
| 571 |
+
if not post_text or len(post_text) < 20:
|
| 572 |
+
continue
|
| 573 |
+
|
| 574 |
+
# Detect district
|
| 575 |
+
detected_district = "Sri Lanka"
|
| 576 |
+
for district in districts:
|
| 577 |
+
if district.lower() in post_text.lower():
|
| 578 |
+
detected_district = district.title()
|
| 579 |
+
break
|
| 580 |
+
|
| 581 |
+
# Determine severity
|
| 582 |
+
severity = "low"
|
| 583 |
+
if any(kw in post_text.lower() for kw in ["protest", "riot", "emergency", "violence", "crisis"]):
|
| 584 |
+
severity = "high"
|
| 585 |
+
elif any(kw in post_text.lower() for kw in ["trending", "viral", "breaking", "update"]):
|
| 586 |
+
severity = "medium"
|
| 587 |
+
|
| 588 |
+
domain_insights.append({
|
| 589 |
+
"source_event_id": str(uuid.uuid4()),
|
| 590 |
+
"domain": "social",
|
| 591 |
+
"summary": f"{detected_district}: {post_text[:200]}",
|
| 592 |
+
"severity": severity,
|
| 593 |
+
"impact_type": "risk" if severity in ["high", "medium"] else "opportunity",
|
| 594 |
+
"timestamp": timestamp,
|
| 595 |
+
"is_llm_generated": False
|
| 596 |
+
})
|
| 597 |
+
|
| 598 |
+
# Add executive summary insight
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
domain_insights.append({
|
| 600 |
"source_event_id": str(uuid.uuid4()),
|
| 601 |
"structured_data": structured_feeds,
|
| 602 |
"domain": "social",
|
| 603 |
+
"summary": f"📊 Social Intelligence Summary: {llm_summary[:300]}",
|
| 604 |
"severity": "medium",
|
| 605 |
+
"impact_type": "risk",
|
| 606 |
+
"is_llm_generated": True
|
| 607 |
})
|
| 608 |
|
| 609 |
+
print(f" ✓ Created {len(domain_insights)} total social intelligence insights")
|
| 610 |
|
| 611 |
return {
|
| 612 |
"final_feed": bulletin,
|
src/utils/utils.py
CHANGED
|
@@ -439,21 +439,15 @@ def scrape_rivernet_impl(
|
|
| 439 |
main_html = page.content()
|
| 440 |
main_soup = BeautifulSoup(main_html, "html.parser")
|
| 441 |
|
| 442 |
-
#
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
if len(alert_text) > 20 and alert_text not in [a.get("text") for a in results["alerts"]]:
|
| 452 |
-
results["alerts"].append({
|
| 453 |
-
"text": alert_text,
|
| 454 |
-
"severity": "high" if keyword in ["danger", "flood"] else "medium",
|
| 455 |
-
"source": "rivernet.lk main page"
|
| 456 |
-
})
|
| 457 |
|
| 458 |
logger.info("[RIVERNET] Main page loaded successfully")
|
| 459 |
|
|
@@ -518,26 +512,43 @@ def scrape_rivernet_impl(
|
|
| 518 |
except (ValueError, IndexError):
|
| 519 |
continue
|
| 520 |
|
| 521 |
-
# Determine status based on keywords (
|
| 522 |
text_lower = page_text.lower()
|
| 523 |
|
| 524 |
-
# Default to normal
|
| 525 |
river_data["status"] = "normal"
|
| 526 |
|
| 527 |
-
#
|
| 528 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
river_data["status"] = "danger"
|
| 530 |
|
| 531 |
-
# WARNING
|
| 532 |
-
elif any(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
river_data["status"] = "warning"
|
| 534 |
|
| 535 |
-
# RISING
|
| 536 |
-
elif any(
|
|
|
|
|
|
|
|
|
|
| 537 |
river_data["status"] = "rising"
|
| 538 |
|
| 539 |
-
#
|
| 540 |
-
elif any(
|
|
|
|
|
|
|
| 541 |
river_data["status"] = "normal"
|
| 542 |
|
| 543 |
results["rivers"].append(river_data)
|
|
|
|
| 439 |
main_html = page.content()
|
| 440 |
main_soup = BeautifulSoup(main_html, "html.parser")
|
| 441 |
|
| 442 |
+
# NOTE: Disabled loose keyword extraction - was causing false positives
|
| 443 |
+
# Real flood alerts will be determined from individual river page status
|
| 444 |
+
# The previous alert_keywords approach matched generic site text like
|
| 445 |
+
# "warning: javascript required" causing fake alerts
|
| 446 |
+
|
| 447 |
+
# If we need main page alerts, look for specific alert banner elements
|
| 448 |
+
# alert_banners = main_soup.select(".alert-banner, .flood-warning, .critical-notice")
|
| 449 |
+
# for banner in alert_banners:
|
| 450 |
+
# results["alerts"].append({...})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
logger.info("[RIVERNET] Main page loaded successfully")
|
| 453 |
|
|
|
|
| 512 |
except (ValueError, IndexError):
|
| 513 |
continue
|
| 514 |
|
| 515 |
+
# Determine status based on keywords (STRICTER to avoid false positives)
|
| 516 |
text_lower = page_text.lower()
|
| 517 |
|
| 518 |
+
# Default to normal - only escalate if clear flood indicators
|
| 519 |
river_data["status"] = "normal"
|
| 520 |
|
| 521 |
+
# CRITICAL: Only consider keywords in FLOOD CONTEXT
|
| 522 |
+
# Look for phrases, not just words, to avoid false positives
|
| 523 |
+
|
| 524 |
+
# DANGER / CRITICAL - Very specific phrases only
|
| 525 |
+
danger_phrases = [
|
| 526 |
+
"major flood", "danger level exceeded", "critical flood",
|
| 527 |
+
"red alert", "evacuate immediately", "extreme flood",
|
| 528 |
+
"water level exceeds danger", "above danger level"
|
| 529 |
+
]
|
| 530 |
+
if any(phrase in text_lower for phrase in danger_phrases):
|
| 531 |
river_data["status"] = "danger"
|
| 532 |
|
| 533 |
+
# WARNING - Specific flood warning phrases
|
| 534 |
+
elif any(phrase in text_lower for phrase in [
|
| 535 |
+
"minor flood", "warning level exceeded", "flood alert issued",
|
| 536 |
+
"amber alert", "approaching warning level",
|
| 537 |
+
"water level exceeds warning", "above warning level"
|
| 538 |
+
]):
|
| 539 |
river_data["status"] = "warning"
|
| 540 |
|
| 541 |
+
# RISING - Only if explicitly rising
|
| 542 |
+
elif any(phrase in text_lower for phrase in [
|
| 543 |
+
"water level rising", "rising trend detected",
|
| 544 |
+
"level is rising rapidly", "increasing water level"
|
| 545 |
+
]):
|
| 546 |
river_data["status"] = "rising"
|
| 547 |
|
| 548 |
+
# NORMAL indicators (optional, just for logging)
|
| 549 |
+
elif any(phrase in text_lower for phrase in [
|
| 550 |
+
"normal level", "stable", "safe level", "decreasing", "below warning"
|
| 551 |
+
]):
|
| 552 |
river_data["status"] = "normal"
|
| 553 |
|
| 554 |
results["rivers"].append(river_data)
|