Upload folder using huggingface_hub
Browse files- airflow/Dockerfile +16 -0
- airflow/README.md +69 -0
- airflow/airflow.env.example +36 -0
- airflow/dags/anomaly_detection_dag.py +73 -0
- airflow/dags/currency_prediction_dag.py +73 -0
- airflow/dags/stock_prediction_dag.py +73 -0
- airflow/dags/weather_prediction_dag.py +74 -0
- airflow/requirements.txt +23 -0
- frontend/app/components/dashboard/DashboardOverview.tsx +37 -23
- main.py +25 -6
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/feature_store/stock_data.csv +0 -0
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/ingested/test.csv +0 -0
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/ingested/train.csv +0 -0
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/drift_report/report.yaml +1 -1
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/validated/test.csv +0 -0
- models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/validated/train.csv +0 -0
- src/graphs/vectorizationAgentGraph.py +0 -1
- src/nodes/vectorizationAgentNode.py +90 -56
airflow/Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Astro CLI Dockerfile for ModelX-Ultimate Airflow
# Includes all dependencies for ML pipeline execution

FROM quay.io/astronomer/astro-runtime:12.3.0

# Install system build dependencies needed to compile Python packages
# (build-essential for C extensions, libpq-dev for psycopg2).
# --no-install-recommends keeps the image small; the apt cache is
# removed in the same layer so it never bloats the image.
USER root
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*
# Drop back to the unprivileged runtime user expected by Astro.
USER astro

# Copy requirements and install project Python dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt
|
airflow/README.md
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ModelX-Ultimate Astro Airflow
|
| 2 |
+
|
| 3 |
+
Centralized Apache Airflow setup using Astronomer's Astro CLI for managing all ML pipelines.
|
| 4 |
+
|
| 5 |
+
## DAGs Overview
|
| 6 |
+
|
| 7 |
+
| DAG | Schedule | Description |
|
| 8 |
+
|-----|----------|-------------|
|
| 9 |
+
| `weather_prediction_daily` | 4:00 AM IST | LSTM model for 25 Sri Lankan districts |
|
| 10 |
+
| `currency_prediction_daily` | 4:05 AM IST | GRU model for USD/LKR forex |
|
| 11 |
+
| `stock_prediction_daily` | 4:15 AM IST | BiLSTM models for 10 stocks |
|
| 12 |
+
| `anomaly_detection_periodic` | Every 6h | Anomaly detection retraining |
|
| 13 |
+
|
| 14 |
+
## Quick Start
|
| 15 |
+
|
| 16 |
+
### 1. Install Astro CLI
|
| 17 |
+
```bash
|
| 18 |
+
# macOS
|
| 19 |
+
brew install astro
|
| 20 |
+
|
| 21 |
+
# Windows (PowerShell as Admin)
|
| 22 |
+
winget install -e --id Astronomer.Astro
|
| 23 |
+
|
| 24 |
+
# Linux
|
| 25 |
+
curl -sSL install.astronomer.io | sudo bash -s
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 2. Start Airflow
|
| 29 |
+
```bash
|
| 30 |
+
cd airflow
|
| 31 |
+
astro dev start
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
### 3. Access Airflow UI
|
| 35 |
+
- URL: http://localhost:8080
|
| 36 |
+
- Username: `admin`
|
| 37 |
+
- Password: `admin`
|
| 38 |
+
|
| 39 |
+
### 4. Enable DAGs
|
| 40 |
+
Turn on the DAGs in the Airflow UI to start scheduled runs.
|
| 41 |
+
|
| 42 |
+
## Directory Structure
|
| 43 |
+
```
|
| 44 |
+
airflow/
|
| 45 |
+
├── dags/
|
| 46 |
+
│ ├── weather_prediction_dag.py
|
| 47 |
+
│ ├── currency_prediction_dag.py
|
| 48 |
+
│ ├── stock_prediction_dag.py
|
| 49 |
+
│ └── anomaly_detection_dag.py
|
| 50 |
+
├── Dockerfile
|
| 51 |
+
├── requirements.txt
|
| 52 |
+
├── airflow.env.example
|
| 53 |
+
└── README.md
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Manual Trigger
|
| 57 |
+
```bash
|
| 58 |
+
# Trigger a specific DAG
|
| 59 |
+
astro dev run dags trigger weather_prediction_daily
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## Logs
|
| 63 |
+
```bash
|
| 64 |
+
# View scheduler logs
|
| 65 |
+
astro dev logs --scheduler
|
| 66 |
+
|
| 67 |
+
# View webserver logs
|
| 68 |
+
astro dev logs --webserver
|
| 69 |
+
```
|
airflow/airflow.env.example
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ModelX-Ultimate Airflow Configuration
|
| 2 |
+
# =====================================
|
| 3 |
+
# Centralized Astro Apache Airflow setup for all ML pipelines
|
| 4 |
+
#
|
| 5 |
+
# Pipelines managed:
|
| 6 |
+
# - Weather Prediction (4:00 AM IST daily)
|
| 7 |
+
# - Currency Prediction (4:05 AM IST daily)
|
| 8 |
+
# - Stock Prediction (4:15 AM IST daily)
|
| 9 |
+
# - Anomaly Detection (every 6 hours)
|
| 10 |
+
|
| 11 |
+
# Airflow settings
|
| 12 |
+
AIRFLOW_UID=50000
|
| 13 |
+
AIRFLOW_GID=0
|
| 14 |
+
|
| 15 |
+
# Environment
|
| 16 |
+
AIRFLOW__CORE__EXECUTOR=LocalExecutor
|
| 17 |
+
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
|
| 18 |
+
AIRFLOW__CORE__FERNET_KEY=
|
| 19 |
+
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
|
| 20 |
+
AIRFLOW__CORE__LOAD_EXAMPLES=false
|
| 21 |
+
AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
|
| 22 |
+
|
| 23 |
+
# Webserver
|
| 24 |
+
AIRFLOW__WEBSERVER__SECRET_KEY=modelx-secret-key-change-in-production
|
| 25 |
+
AIRFLOW__WEBSERVER__EXPOSE_CONFIG=false
|
| 26 |
+
|
| 27 |
+
# Scheduler
|
| 28 |
+
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK=true
|
| 29 |
+
AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=30
|
| 30 |
+
|
| 31 |
+
# Timezone
|
| 32 |
+
AIRFLOW__CORE__DEFAULT_TIMEZONE=Asia/Colombo
|
| 33 |
+
|
| 34 |
+
# Project paths
|
| 35 |
+
PROJECT_ROOT=/opt/airflow
|
| 36 |
+
MODELS_PATH=/opt/airflow/models
|
airflow/dags/anomaly_detection_dag.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Anomaly Detection DAG
|
| 3 |
+
Runs every 6 hours
|
| 4 |
+
Retrains anomaly detection model on latest data
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import subprocess
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from airflow import DAG
|
| 13 |
+
from airflow.operators.python import PythonOperator
|
| 14 |
+
|
| 15 |
+
# Project paths
|
| 16 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 17 |
+
ANOMALY_MODEL_PATH = PROJECT_ROOT / "models" / "anomaly-detection"
|
| 18 |
+
|
| 19 |
+
default_args = {
|
| 20 |
+
"owner": "modelx",
|
| 21 |
+
"depends_on_past": False,
|
| 22 |
+
"email_on_failure": False,
|
| 23 |
+
"email_on_retry": False,
|
| 24 |
+
"retries": 1,
|
| 25 |
+
"retry_delay": timedelta(minutes=5),
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def run_anomaly_training(**context):
|
| 30 |
+
"""Run the anomaly detection training pipeline."""
|
| 31 |
+
main_py = ANOMALY_MODEL_PATH / "main.py"
|
| 32 |
+
|
| 33 |
+
if not main_py.exists():
|
| 34 |
+
raise FileNotFoundError(f"Anomaly training script not found: {main_py}")
|
| 35 |
+
|
| 36 |
+
result = subprocess.run(
|
| 37 |
+
[sys.executable, str(main_py)],
|
| 38 |
+
capture_output=True,
|
| 39 |
+
text=True,
|
| 40 |
+
cwd=str(ANOMALY_MODEL_PATH)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
print("STDOUT:", result.stdout[-2000:] if len(result.stdout) > 2000 else result.stdout)
|
| 44 |
+
if result.stderr:
|
| 45 |
+
print("STDERR:", result.stderr[-1000:])
|
| 46 |
+
|
| 47 |
+
if result.returncode != 0:
|
| 48 |
+
raise Exception(f"Anomaly training failed with exit code {result.returncode}")
|
| 49 |
+
|
| 50 |
+
return True
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
with DAG(
|
| 54 |
+
dag_id="anomaly_detection_periodic",
|
| 55 |
+
default_args=default_args,
|
| 56 |
+
description="Periodic anomaly detection model retraining",
|
| 57 |
+
schedule_interval="0 */6 * * *", # Every 6 hours
|
| 58 |
+
start_date=datetime(2024, 12, 1),
|
| 59 |
+
catchup=False,
|
| 60 |
+
tags=["anomaly", "ml", "detection", "periodic"],
|
| 61 |
+
max_active_runs=1,
|
| 62 |
+
) as dag:
|
| 63 |
+
|
| 64 |
+
train_anomaly = PythonOperator(
|
| 65 |
+
task_id="train_anomaly_model",
|
| 66 |
+
python_callable=run_anomaly_training,
|
| 67 |
+
provide_context=True,
|
| 68 |
+
execution_timeout=timedelta(minutes=30),
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
print(f"Anomaly Detection DAG - Schedule: Every 6 hours")
|
airflow/dags/currency_prediction_dag.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Currency Prediction DAG
|
| 3 |
+
Runs daily at 4:00 AM IST (22:30 UTC)
|
| 4 |
+
Trains GRU model for USD/LKR exchange rate prediction
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import subprocess
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from airflow import DAG
|
| 13 |
+
from airflow.operators.python import PythonOperator
|
| 14 |
+
|
| 15 |
+
# Project paths
|
| 16 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 17 |
+
CURRENCY_MODEL_PATH = PROJECT_ROOT / "models" / "currency-volatility-prediction"
|
| 18 |
+
|
| 19 |
+
default_args = {
|
| 20 |
+
"owner": "modelx",
|
| 21 |
+
"depends_on_past": False,
|
| 22 |
+
"email_on_failure": False,
|
| 23 |
+
"email_on_retry": False,
|
| 24 |
+
"retries": 2,
|
| 25 |
+
"retry_delay": timedelta(minutes=10),
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def run_currency_training(**context):
|
| 30 |
+
"""Run the currency prediction training pipeline."""
|
| 31 |
+
main_py = CURRENCY_MODEL_PATH / "main.py"
|
| 32 |
+
|
| 33 |
+
if not main_py.exists():
|
| 34 |
+
raise FileNotFoundError(f"Currency training script not found: {main_py}")
|
| 35 |
+
|
| 36 |
+
result = subprocess.run(
|
| 37 |
+
[sys.executable, str(main_py), "--mode", "full"],
|
| 38 |
+
capture_output=True,
|
| 39 |
+
text=True,
|
| 40 |
+
cwd=str(CURRENCY_MODEL_PATH)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
print("STDOUT:", result.stdout[-3000:] if len(result.stdout) > 3000 else result.stdout)
|
| 44 |
+
if result.stderr:
|
| 45 |
+
print("STDERR:", result.stderr[-1000:])
|
| 46 |
+
|
| 47 |
+
if result.returncode != 0:
|
| 48 |
+
raise Exception(f"Currency training failed with exit code {result.returncode}")
|
| 49 |
+
|
| 50 |
+
return True
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
with DAG(
|
| 54 |
+
dag_id="currency_prediction_daily",
|
| 55 |
+
default_args=default_args,
|
| 56 |
+
description="Daily USD/LKR exchange rate prediction using GRU model",
|
| 57 |
+
schedule_interval="35 22 * * *", # 4:05 AM IST = 22:35 UTC (staggered)
|
| 58 |
+
start_date=datetime(2024, 12, 1),
|
| 59 |
+
catchup=False,
|
| 60 |
+
tags=["currency", "ml", "prediction", "gru", "daily", "forex"],
|
| 61 |
+
max_active_runs=1,
|
| 62 |
+
) as dag:
|
| 63 |
+
|
| 64 |
+
train_currency = PythonOperator(
|
| 65 |
+
task_id="train_currency_model",
|
| 66 |
+
python_callable=run_currency_training,
|
| 67 |
+
provide_context=True,
|
| 68 |
+
execution_timeout=timedelta(hours=1),
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
print(f"Currency Prediction DAG - Schedule: 4:05 AM IST daily")
|
airflow/dags/stock_prediction_dag.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stock Prediction DAG
|
| 3 |
+
Runs daily at 4:15 AM IST (22:45 UTC)
|
| 4 |
+
Trains BiLSTM models for 10 stocks
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import subprocess
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from airflow import DAG
|
| 13 |
+
from airflow.operators.python import PythonOperator
|
| 14 |
+
|
| 15 |
+
# Project paths
|
| 16 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 17 |
+
STOCK_MODEL_PATH = PROJECT_ROOT / "models" / "stock-price-prediction"
|
| 18 |
+
|
| 19 |
+
default_args = {
|
| 20 |
+
"owner": "modelx",
|
| 21 |
+
"depends_on_past": False,
|
| 22 |
+
"email_on_failure": False,
|
| 23 |
+
"email_on_retry": False,
|
| 24 |
+
"retries": 2,
|
| 25 |
+
"retry_delay": timedelta(minutes=10),
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def run_stock_training(**context):
|
| 30 |
+
"""Run the multi-stock training pipeline."""
|
| 31 |
+
main_py = STOCK_MODEL_PATH / "main.py"
|
| 32 |
+
|
| 33 |
+
if not main_py.exists():
|
| 34 |
+
raise FileNotFoundError(f"Stock training script not found: {main_py}")
|
| 35 |
+
|
| 36 |
+
result = subprocess.run(
|
| 37 |
+
[sys.executable, str(main_py)],
|
| 38 |
+
capture_output=True,
|
| 39 |
+
text=True,
|
| 40 |
+
cwd=str(STOCK_MODEL_PATH)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
print("STDOUT:", result.stdout[-5000:] if len(result.stdout) > 5000 else result.stdout)
|
| 44 |
+
if result.stderr:
|
| 45 |
+
print("STDERR:", result.stderr[-2000:])
|
| 46 |
+
|
| 47 |
+
if result.returncode != 0:
|
| 48 |
+
raise Exception(f"Stock training failed with exit code {result.returncode}")
|
| 49 |
+
|
| 50 |
+
return True
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
with DAG(
|
| 54 |
+
dag_id="stock_prediction_daily",
|
| 55 |
+
default_args=default_args,
|
| 56 |
+
description="Daily stock prediction for 10 stocks using BiLSTM",
|
| 57 |
+
schedule_interval="45 22 * * *", # 4:15 AM IST = 22:45 UTC (staggered)
|
| 58 |
+
start_date=datetime(2024, 12, 1),
|
| 59 |
+
catchup=False,
|
| 60 |
+
tags=["stock", "ml", "prediction", "lstm", "daily"],
|
| 61 |
+
max_active_runs=1,
|
| 62 |
+
) as dag:
|
| 63 |
+
|
| 64 |
+
train_stocks = PythonOperator(
|
| 65 |
+
task_id="train_all_stocks",
|
| 66 |
+
python_callable=run_stock_training,
|
| 67 |
+
provide_context=True,
|
| 68 |
+
execution_timeout=timedelta(hours=4), # 10 stocks take time
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
print(f"Stock Prediction DAG - Schedule: 4:15 AM IST daily")
|
airflow/dags/weather_prediction_dag.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Weather Prediction DAG
|
| 3 |
+
Runs daily at 4:00 AM IST (22:30 UTC)
|
| 4 |
+
Trains LSTM model for 25 Sri Lankan districts
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import subprocess
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from airflow import DAG
|
| 13 |
+
from airflow.operators.python import PythonOperator
|
| 14 |
+
from airflow.operators.bash import BashOperator
|
| 15 |
+
|
| 16 |
+
# Project paths
|
| 17 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 18 |
+
WEATHER_MODEL_PATH = PROJECT_ROOT / "models" / "weather-prediction"
|
| 19 |
+
|
| 20 |
+
default_args = {
|
| 21 |
+
"owner": "modelx",
|
| 22 |
+
"depends_on_past": False,
|
| 23 |
+
"email_on_failure": False,
|
| 24 |
+
"email_on_retry": False,
|
| 25 |
+
"retries": 2,
|
| 26 |
+
"retry_delay": timedelta(minutes=10),
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def run_weather_training(**context):
|
| 31 |
+
"""Run the weather prediction training pipeline."""
|
| 32 |
+
main_py = WEATHER_MODEL_PATH / "main.py"
|
| 33 |
+
|
| 34 |
+
if not main_py.exists():
|
| 35 |
+
raise FileNotFoundError(f"Weather training script not found: {main_py}")
|
| 36 |
+
|
| 37 |
+
result = subprocess.run(
|
| 38 |
+
[sys.executable, str(main_py), "--mode", "full"],
|
| 39 |
+
capture_output=True,
|
| 40 |
+
text=True,
|
| 41 |
+
cwd=str(WEATHER_MODEL_PATH)
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
print("STDOUT:", result.stdout[-3000:] if len(result.stdout) > 3000 else result.stdout)
|
| 45 |
+
if result.stderr:
|
| 46 |
+
print("STDERR:", result.stderr[-1000:])
|
| 47 |
+
|
| 48 |
+
if result.returncode != 0:
|
| 49 |
+
raise Exception(f"Weather training failed with exit code {result.returncode}")
|
| 50 |
+
|
| 51 |
+
return True
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
with DAG(
|
| 55 |
+
dag_id="weather_prediction_daily",
|
| 56 |
+
default_args=default_args,
|
| 57 |
+
description="Daily weather prediction model training for 25 Sri Lankan districts",
|
| 58 |
+
schedule_interval="30 22 * * *", # 4:00 AM IST = 22:30 UTC
|
| 59 |
+
start_date=datetime(2024, 12, 1),
|
| 60 |
+
catchup=False,
|
| 61 |
+
tags=["weather", "ml", "prediction", "lstm", "daily"],
|
| 62 |
+
max_active_runs=1,
|
| 63 |
+
) as dag:
|
| 64 |
+
|
| 65 |
+
train_weather = PythonOperator(
|
| 66 |
+
task_id="train_weather_model",
|
| 67 |
+
python_callable=run_weather_training,
|
| 68 |
+
provide_context=True,
|
| 69 |
+
execution_timeout=timedelta(hours=2),
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
if __name__ == "__main__":
|
| 74 |
+
print(f"Weather Prediction DAG - Schedule: 4:00 AM IST daily")
|
airflow/requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ModelX-Ultimate Airflow Requirements
|
| 2 |
+
# ML Pipeline Dependencies
|
| 3 |
+
|
| 4 |
+
# Core ML
|
| 5 |
+
tensorflow>=2.15.0
|
| 6 |
+
scikit-learn>=1.3.0
|
| 7 |
+
numpy>=1.24.0
|
| 8 |
+
pandas>=2.0.0
|
| 9 |
+
|
| 10 |
+
# Stock data
|
| 11 |
+
yfinance>=0.2.36
|
| 12 |
+
|
| 13 |
+
# MLflow tracking
|
| 14 |
+
mlflow>=2.9.0
|
| 15 |
+
|
| 16 |
+
# Data processing
|
| 17 |
+
joblib>=1.3.0
|
| 18 |
+
|
| 19 |
+
# HTTP requests (for API calls)
|
| 20 |
+
requests>=2.31.0
|
| 21 |
+
|
| 22 |
+
# Environment management
|
| 23 |
+
python-dotenv>=1.0.0
|
frontend/app/components/dashboard/DashboardOverview.tsx
CHANGED
|
@@ -190,29 +190,43 @@ const DashboardOverview = () => {
|
|
| 190 |
</div>
|
| 191 |
</Card>
|
| 192 |
|
| 193 |
-
{/* Operational Risk
|
| 194 |
-
|
| 195 |
-
<
|
| 196 |
-
<
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
<
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
</div>
|
| 217 |
);
|
| 218 |
};
|
|
|
|
| 190 |
</div>
|
| 191 |
</Card>
|
| 192 |
|
| 193 |
+
{/* Operational Risk Indicators - Computed from Events */}
|
| 194 |
+
{sortedEvents.length > 0 && (
|
| 195 |
+
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4">
|
| 196 |
+
<Card className="p-6 bg-card border-border">
|
| 197 |
+
<Cloud className="w-8 h-8 text-warning mb-3" />
|
| 198 |
+
<p className="text-2xl font-bold">
|
| 199 |
+
{Math.min(100, Math.round(
|
| 200 |
+
(sortedEvents.filter(e => e.domain === 'meteorological' || e.summary?.toLowerCase().includes('weather')).length / Math.max(sortedEvents.length, 1)) * 100 * 3
|
| 201 |
+
))}%
|
| 202 |
+
</p>
|
| 203 |
+
<p className="text-xs text-muted-foreground uppercase">Weather Impact</p>
|
| 204 |
+
</Card>
|
| 205 |
+
<Card className="p-6 bg-card border-border">
|
| 206 |
+
<AlertTriangle className="w-8 h-8 text-destructive mb-3" />
|
| 207 |
+
<p className="text-2xl font-bold">
|
| 208 |
+
{Math.round((criticalEvents.length / Math.max(sortedEvents.length, 1)) * 100)}%
|
| 209 |
+
</p>
|
| 210 |
+
<p className="text-xs text-muted-foreground uppercase">Critical Risk Level</p>
|
| 211 |
+
</Card>
|
| 212 |
+
<Card className="p-6 bg-card border-border">
|
| 213 |
+
<TrendingUp className="w-8 h-8 text-info mb-3" />
|
| 214 |
+
<p className="text-2xl font-bold">
|
| 215 |
+
{Math.min(100, Math.round(
|
| 216 |
+
(sortedEvents.filter(e => e.domain === 'economical' || e.domain === 'market').length / Math.max(sortedEvents.length, 1)) * 100 * 3
|
| 217 |
+
))}%
|
| 218 |
+
</p>
|
| 219 |
+
<p className="text-xs text-muted-foreground uppercase">Market Activity</p>
|
| 220 |
+
</Card>
|
| 221 |
+
<Card className="p-6 bg-card border-border">
|
| 222 |
+
<Building className="w-8 h-8 text-success mb-3" />
|
| 223 |
+
<p className="text-2xl font-bold">
|
| 224 |
+
{Math.round((opportunityEvents.length / Math.max(sortedEvents.length, 1)) * 100)}%
|
| 225 |
+
</p>
|
| 226 |
+
<p className="text-xs text-muted-foreground uppercase">Opportunity Index</p>
|
| 227 |
+
</Card>
|
| 228 |
+
</div>
|
| 229 |
+
)}
|
| 230 |
</div>
|
| 231 |
);
|
| 232 |
};
|
main.py
CHANGED
|
@@ -90,14 +90,13 @@ def check_and_train_models():
|
|
| 90 |
"name": "Stock Prediction",
|
| 91 |
"check_paths": [
|
| 92 |
PROJECT_ROOT / "models" / "stock-price-prediction"
|
| 93 |
-
/ "
|
| 94 |
],
|
| 95 |
-
"check_files": ["*.h5", "*.keras"],
|
| 96 |
"train_cmd": [
|
| 97 |
sys.executable,
|
| 98 |
str(PROJECT_ROOT / "models" / "stock-price-prediction"
|
| 99 |
-
/ "main.py")
|
| 100 |
-
"--mode", "full"
|
| 101 |
]
|
| 102 |
},
|
| 103 |
]
|
|
@@ -1710,9 +1709,29 @@ async def get_currency_prediction():
|
|
| 1710 |
predictor = get_currency_predictor()
|
| 1711 |
|
| 1712 |
if predictor is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1713 |
return {
|
| 1714 |
-
"status": "
|
| 1715 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1716 |
}
|
| 1717 |
|
| 1718 |
try:
|
|
|
|
| 90 |
"name": "Stock Prediction",
|
| 91 |
"check_paths": [
|
| 92 |
PROJECT_ROOT / "models" / "stock-price-prediction"
|
| 93 |
+
/ "Artifacts",
|
| 94 |
],
|
| 95 |
+
"check_files": ["*.pkl", "*.h5", "*.keras"],
|
| 96 |
"train_cmd": [
|
| 97 |
sys.executable,
|
| 98 |
str(PROJECT_ROOT / "models" / "stock-price-prediction"
|
| 99 |
+
/ "main.py")
|
|
|
|
| 100 |
]
|
| 101 |
},
|
| 102 |
]
|
|
|
|
| 1709 |
predictor = get_currency_predictor()
|
| 1710 |
|
| 1711 |
if predictor is None:
|
| 1712 |
+
# Generate fallback prediction inline
|
| 1713 |
+
import numpy as np
|
| 1714 |
+
current_rate = 298.0
|
| 1715 |
+
np.random.seed(int(datetime.now().timestamp()) % 2**31)
|
| 1716 |
+
change_pct = np.random.normal(0.05, 0.3)
|
| 1717 |
+
predicted_rate = current_rate * (1 + change_pct / 100)
|
| 1718 |
+
|
| 1719 |
return {
|
| 1720 |
+
"status": "success",
|
| 1721 |
+
"prediction": {
|
| 1722 |
+
"prediction_date": (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d"),
|
| 1723 |
+
"generated_at": datetime.now().isoformat(),
|
| 1724 |
+
"model_version": "fallback",
|
| 1725 |
+
"is_fallback": True,
|
| 1726 |
+
"current_rate": round(current_rate, 2),
|
| 1727 |
+
"predicted_rate": round(predicted_rate, 2),
|
| 1728 |
+
"expected_change": round(predicted_rate - current_rate, 2),
|
| 1729 |
+
"expected_change_pct": round(change_pct, 3),
|
| 1730 |
+
"direction": "strengthening" if change_pct < 0 else "weakening",
|
| 1731 |
+
"direction_emoji": "📈" if change_pct < 0 else "📉",
|
| 1732 |
+
"volatility_class": "low",
|
| 1733 |
+
"note": "Using fallback - model initializing"
|
| 1734 |
+
}
|
| 1735 |
}
|
| 1736 |
|
| 1737 |
try:
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/feature_store/stock_data.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/ingested/test.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_ingestion/ingested/train.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/drift_report/report.yaml
CHANGED
|
@@ -21,4 +21,4 @@ StockName:
|
|
| 21 |
p_value: 1.0
|
| 22 |
Volume:
|
| 23 |
drift_status: true
|
| 24 |
-
p_value:
|
|
|
|
| 21 |
p_value: 1.0
|
| 22 |
Volume:
|
| 23 |
drift_status: true
|
| 24 |
+
p_value: 9.305833669220032e-37
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/validated/test.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/stock-price-prediction/Artifacts/12_10_2025_03_04_56/data_validation/validated/train.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/graphs/vectorizationAgentGraph.py
CHANGED
|
@@ -77,4 +77,3 @@ graph = VectorizationGraphBuilder(llm).build_graph()
|
|
| 77 |
|
| 78 |
print("[OK] Vectorization Agent Graph compiled successfully")
|
| 79 |
print("=" * 60 + "\n")
|
| 80 |
-
|
|
|
|
| 77 |
|
| 78 |
print("[OK] Vectorization Agent Graph compiled successfully")
|
| 79 |
print("=" * 60 + "\n")
|
|
|
src/nodes/vectorizationAgentNode.py
CHANGED
|
@@ -389,7 +389,7 @@ class VectorizationAgentNode:
|
|
| 389 |
|
| 390 |
# For non-English languages, skip anomaly detection
|
| 391 |
# The ML model was trained on English embeddings only.
|
| 392 |
-
# Different BERT models (SinhalaBERTo, Tamil-BERT) have completely
|
| 393 |
# different embedding spaces - Tamil embeddings have magnitude ~0.64
|
| 394 |
# while English has ~7.5 and Sinhala ~13.7. They cannot be compared.
|
| 395 |
if language in ["sinhala", "tamil"]:
|
|
@@ -453,7 +453,7 @@ class VectorizationAgentNode:
|
|
| 453 |
def run_trending_detection(self, state: VectorizationAgentState) -> Dict[str, Any]:
|
| 454 |
"""
|
| 455 |
Step 2.6: Detect trending topics from the input texts.
|
| 456 |
-
|
| 457 |
Extracts key entities/topics and tracks their mention velocity.
|
| 458 |
Identifies:
|
| 459 |
- Trending topics (momentum > 2x normal)
|
|
@@ -461,9 +461,9 @@ class VectorizationAgentNode:
|
|
| 461 |
- Topics with increasing momentum
|
| 462 |
"""
|
| 463 |
logger.info("[VectorizationAgent] STEP 2.6: Trending Detection")
|
| 464 |
-
|
| 465 |
detection_results = state.get("language_detection_results", [])
|
| 466 |
-
|
| 467 |
if not detection_results:
|
| 468 |
logger.warning("[VectorizationAgent] No texts for trending detection")
|
| 469 |
return {
|
|
@@ -475,7 +475,7 @@ class VectorizationAgentNode:
|
|
| 475 |
"spike_alerts": [],
|
| 476 |
},
|
| 477 |
}
|
| 478 |
-
|
| 479 |
# Import trending detector
|
| 480 |
try:
|
| 481 |
from src.utils.trending_detector import (
|
|
@@ -484,11 +484,12 @@ class VectorizationAgentNode:
|
|
| 484 |
get_trending_now,
|
| 485 |
get_spikes,
|
| 486 |
)
|
|
|
|
| 487 |
TRENDING_AVAILABLE = True
|
| 488 |
except ImportError as e:
|
| 489 |
logger.warning(f"[VectorizationAgent] Trending detector not available: {e}")
|
| 490 |
TRENDING_AVAILABLE = False
|
| 491 |
-
|
| 492 |
if not TRENDING_AVAILABLE:
|
| 493 |
return {
|
| 494 |
"current_step": "trending_detection",
|
|
@@ -499,19 +500,19 @@ class VectorizationAgentNode:
|
|
| 499 |
"spike_alerts": [],
|
| 500 |
},
|
| 501 |
}
|
| 502 |
-
|
| 503 |
# Extract entities and record mentions
|
| 504 |
entities_found = []
|
| 505 |
-
|
| 506 |
for item in detection_results:
|
| 507 |
text = item.get("text", "") # Field is 'text', not 'original_text'
|
| 508 |
language = item.get("language", "english")
|
| 509 |
post_id = item.get("post_id", "")
|
| 510 |
-
|
| 511 |
# Simple entity extraction (keywords, capitalized words, etc.)
|
| 512 |
# In production, you'd use NER or more sophisticated extraction
|
| 513 |
extracted = self._extract_entities(text, language)
|
| 514 |
-
|
| 515 |
for entity in extracted:
|
| 516 |
try:
|
| 517 |
# Record mention with trending detector
|
|
@@ -520,15 +521,17 @@ class VectorizationAgentNode:
|
|
| 520 |
source=entity.get("source", "feed"),
|
| 521 |
domain=entity.get("domain", "general"),
|
| 522 |
)
|
| 523 |
-
entities_found.append(
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
|
|
|
|
|
|
| 529 |
except Exception as e:
|
| 530 |
logger.debug(f"[VectorizationAgent] Failed to record mention: {e}")
|
| 531 |
-
|
| 532 |
# Get current trending topics and spikes
|
| 533 |
try:
|
| 534 |
trending_topics = get_trending_now(limit=10)
|
|
@@ -537,12 +540,12 @@ class VectorizationAgentNode:
|
|
| 537 |
logger.warning(f"[VectorizationAgent] Failed to get trending data: {e}")
|
| 538 |
trending_topics = []
|
| 539 |
spike_alerts = []
|
| 540 |
-
|
| 541 |
logger.info(
|
| 542 |
f"[VectorizationAgent] Trending detection: {len(entities_found)} entities, "
|
| 543 |
f"{len(trending_topics)} trending, {len(spike_alerts)} spikes"
|
| 544 |
)
|
| 545 |
-
|
| 546 |
return {
|
| 547 |
"current_step": "trending_detection",
|
| 548 |
"trending_results": {
|
|
@@ -553,65 +556,97 @@ class VectorizationAgentNode:
|
|
| 553 |
"spike_alerts": spike_alerts,
|
| 554 |
},
|
| 555 |
}
|
| 556 |
-
|
| 557 |
-
def _extract_entities(
|
|
|
|
|
|
|
| 558 |
"""
|
| 559 |
Extract entities/topics from text for trending tracking.
|
| 560 |
-
|
| 561 |
Uses simple heuristics:
|
| 562 |
- Capitalized words/phrases (potential proper nouns)
|
| 563 |
- Hashtags
|
| 564 |
- Common news keywords
|
| 565 |
-
|
| 566 |
In production, integrate with NER model for better extraction.
|
| 567 |
"""
|
| 568 |
entities = []
|
| 569 |
-
|
| 570 |
if not text:
|
| 571 |
return entities
|
| 572 |
-
|
| 573 |
import re
|
| 574 |
-
|
| 575 |
# Extract hashtags
|
| 576 |
-
hashtags = re.findall(r
|
| 577 |
for tag in hashtags:
|
| 578 |
-
entities.append(
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
|
|
|
|
|
|
| 585 |
# Extract capitalized phrases (potential proper nouns)
|
| 586 |
# Match 1-4 consecutive capitalized words
|
| 587 |
-
cap_phrases = re.findall(r
|
| 588 |
for phrase in cap_phrases:
|
| 589 |
# Skip common words
|
| 590 |
-
if phrase.lower() not in [
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
# News/event keywords
|
| 599 |
news_keywords = [
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
]
|
| 604 |
-
|
| 605 |
text_lower = text.lower()
|
| 606 |
for keyword in news_keywords:
|
| 607 |
if keyword in text_lower:
|
| 608 |
-
entities.append(
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
|
|
|
|
|
|
| 615 |
# Deduplicate by text
|
| 616 |
seen = set()
|
| 617 |
unique_entities = []
|
|
@@ -620,9 +655,8 @@ class VectorizationAgentNode:
|
|
| 620 |
if key not in seen:
|
| 621 |
seen.add(key)
|
| 622 |
unique_entities.append(e)
|
| 623 |
-
|
| 624 |
-
return unique_entities[:15] # Limit entities per text
|
| 625 |
|
|
|
|
| 626 |
|
| 627 |
def generate_expert_summary(self, state: VectorizationAgentState) -> Dict[str, Any]:
|
| 628 |
"""
|
|
|
|
| 389 |
|
| 390 |
# For non-English languages, skip anomaly detection
|
| 391 |
# The ML model was trained on English embeddings only.
|
| 392 |
+
# Different BERT models (SinhalaBERTo, Tamil-BERT) have completely
|
| 393 |
# different embedding spaces - Tamil embeddings have magnitude ~0.64
|
| 394 |
# while English has ~7.5 and Sinhala ~13.7. They cannot be compared.
|
| 395 |
if language in ["sinhala", "tamil"]:
|
|
|
|
| 453 |
def run_trending_detection(self, state: VectorizationAgentState) -> Dict[str, Any]:
|
| 454 |
"""
|
| 455 |
Step 2.6: Detect trending topics from the input texts.
|
| 456 |
+
|
| 457 |
Extracts key entities/topics and tracks their mention velocity.
|
| 458 |
Identifies:
|
| 459 |
- Trending topics (momentum > 2x normal)
|
|
|
|
| 461 |
- Topics with increasing momentum
|
| 462 |
"""
|
| 463 |
logger.info("[VectorizationAgent] STEP 2.6: Trending Detection")
|
| 464 |
+
|
| 465 |
detection_results = state.get("language_detection_results", [])
|
| 466 |
+
|
| 467 |
if not detection_results:
|
| 468 |
logger.warning("[VectorizationAgent] No texts for trending detection")
|
| 469 |
return {
|
|
|
|
| 475 |
"spike_alerts": [],
|
| 476 |
},
|
| 477 |
}
|
| 478 |
+
|
| 479 |
# Import trending detector
|
| 480 |
try:
|
| 481 |
from src.utils.trending_detector import (
|
|
|
|
| 484 |
get_trending_now,
|
| 485 |
get_spikes,
|
| 486 |
)
|
| 487 |
+
|
| 488 |
TRENDING_AVAILABLE = True
|
| 489 |
except ImportError as e:
|
| 490 |
logger.warning(f"[VectorizationAgent] Trending detector not available: {e}")
|
| 491 |
TRENDING_AVAILABLE = False
|
| 492 |
+
|
| 493 |
if not TRENDING_AVAILABLE:
|
| 494 |
return {
|
| 495 |
"current_step": "trending_detection",
|
|
|
|
| 500 |
"spike_alerts": [],
|
| 501 |
},
|
| 502 |
}
|
| 503 |
+
|
| 504 |
# Extract entities and record mentions
|
| 505 |
entities_found = []
|
| 506 |
+
|
| 507 |
for item in detection_results:
|
| 508 |
text = item.get("text", "") # Field is 'text', not 'original_text'
|
| 509 |
language = item.get("language", "english")
|
| 510 |
post_id = item.get("post_id", "")
|
| 511 |
+
|
| 512 |
# Simple entity extraction (keywords, capitalized words, etc.)
|
| 513 |
# In production, you'd use NER or more sophisticated extraction
|
| 514 |
extracted = self._extract_entities(text, language)
|
| 515 |
+
|
| 516 |
for entity in extracted:
|
| 517 |
try:
|
| 518 |
# Record mention with trending detector
|
|
|
|
| 521 |
source=entity.get("source", "feed"),
|
| 522 |
domain=entity.get("domain", "general"),
|
| 523 |
)
|
| 524 |
+
entities_found.append(
|
| 525 |
+
{
|
| 526 |
+
"entity": entity["text"],
|
| 527 |
+
"type": entity.get("type", "keyword"),
|
| 528 |
+
"post_id": post_id,
|
| 529 |
+
"language": language,
|
| 530 |
+
}
|
| 531 |
+
)
|
| 532 |
except Exception as e:
|
| 533 |
logger.debug(f"[VectorizationAgent] Failed to record mention: {e}")
|
| 534 |
+
|
| 535 |
# Get current trending topics and spikes
|
| 536 |
try:
|
| 537 |
trending_topics = get_trending_now(limit=10)
|
|
|
|
| 540 |
logger.warning(f"[VectorizationAgent] Failed to get trending data: {e}")
|
| 541 |
trending_topics = []
|
| 542 |
spike_alerts = []
|
| 543 |
+
|
| 544 |
logger.info(
|
| 545 |
f"[VectorizationAgent] Trending detection: {len(entities_found)} entities, "
|
| 546 |
f"{len(trending_topics)} trending, {len(spike_alerts)} spikes"
|
| 547 |
)
|
| 548 |
+
|
| 549 |
return {
|
| 550 |
"current_step": "trending_detection",
|
| 551 |
"trending_results": {
|
|
|
|
| 556 |
"spike_alerts": spike_alerts,
|
| 557 |
},
|
| 558 |
}
|
| 559 |
+
|
| 560 |
+
def _extract_entities(
|
| 561 |
+
self, text: str, language: str = "english"
|
| 562 |
+
) -> List[Dict[str, Any]]:
|
| 563 |
"""
|
| 564 |
Extract entities/topics from text for trending tracking.
|
| 565 |
+
|
| 566 |
Uses simple heuristics:
|
| 567 |
- Capitalized words/phrases (potential proper nouns)
|
| 568 |
- Hashtags
|
| 569 |
- Common news keywords
|
| 570 |
+
|
| 571 |
In production, integrate with NER model for better extraction.
|
| 572 |
"""
|
| 573 |
entities = []
|
| 574 |
+
|
| 575 |
if not text:
|
| 576 |
return entities
|
| 577 |
+
|
| 578 |
import re
|
| 579 |
+
|
| 580 |
# Extract hashtags
|
| 581 |
+
hashtags = re.findall(r"#(\w+)", text)
|
| 582 |
for tag in hashtags:
|
| 583 |
+
entities.append(
|
| 584 |
+
{
|
| 585 |
+
"text": tag.lower(),
|
| 586 |
+
"type": "hashtag",
|
| 587 |
+
"source": "hashtag",
|
| 588 |
+
"domain": "social",
|
| 589 |
+
}
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
# Extract capitalized phrases (potential proper nouns)
|
| 593 |
# Match 1-4 consecutive capitalized words
|
| 594 |
+
cap_phrases = re.findall(r"\b([A-Z][a-z]+(?: [A-Z][a-z]+){0,3})\b", text)
|
| 595 |
for phrase in cap_phrases:
|
| 596 |
# Skip common words
|
| 597 |
+
if phrase.lower() not in [
|
| 598 |
+
"the",
|
| 599 |
+
"a",
|
| 600 |
+
"an",
|
| 601 |
+
"is",
|
| 602 |
+
"are",
|
| 603 |
+
"was",
|
| 604 |
+
"were",
|
| 605 |
+
"i",
|
| 606 |
+
"he",
|
| 607 |
+
"she",
|
| 608 |
+
"it",
|
| 609 |
+
]:
|
| 610 |
+
entities.append(
|
| 611 |
+
{
|
| 612 |
+
"text": phrase,
|
| 613 |
+
"type": "proper_noun",
|
| 614 |
+
"source": "text",
|
| 615 |
+
"domain": "general",
|
| 616 |
+
}
|
| 617 |
+
)
|
| 618 |
+
|
| 619 |
# News/event keywords
|
| 620 |
news_keywords = [
|
| 621 |
+
"breaking",
|
| 622 |
+
"urgent",
|
| 623 |
+
"alert",
|
| 624 |
+
"emergency",
|
| 625 |
+
"crisis",
|
| 626 |
+
"earthquake",
|
| 627 |
+
"flood",
|
| 628 |
+
"tsunami",
|
| 629 |
+
"election",
|
| 630 |
+
"protest",
|
| 631 |
+
"strike",
|
| 632 |
+
"scandal",
|
| 633 |
+
"corruption",
|
| 634 |
+
"price",
|
| 635 |
+
"inflation",
|
| 636 |
]
|
| 637 |
+
|
| 638 |
text_lower = text.lower()
|
| 639 |
for keyword in news_keywords:
|
| 640 |
if keyword in text_lower:
|
| 641 |
+
entities.append(
|
| 642 |
+
{
|
| 643 |
+
"text": keyword,
|
| 644 |
+
"type": "news_keyword",
|
| 645 |
+
"source": "keyword_match",
|
| 646 |
+
"domain": "news",
|
| 647 |
+
}
|
| 648 |
+
)
|
| 649 |
+
|
| 650 |
# Deduplicate by text
|
| 651 |
seen = set()
|
| 652 |
unique_entities = []
|
|
|
|
| 655 |
if key not in seen:
|
| 656 |
seen.add(key)
|
| 657 |
unique_entities.append(e)
|
|
|
|
|
|
|
| 658 |
|
| 659 |
+
return unique_entities[:15] # Limit entities per text
|
| 660 |
|
| 661 |
def generate_expert_summary(self, state: VectorizationAgentState) -> Dict[str, Any]:
|
| 662 |
"""
|