Commit Β·
b75c637
0
Parent(s):
deploy: HF sanitized GUI snapshot
Browse files- .dockerignore +59 -0
- .github/workflows/hf_sync.yml +126 -0
- .gitignore +84 -0
- Dockerfile +56 -0
- LICENSE +5 -0
- README.md +38 -0
- engine/__init__.py +13 -0
- engine/__main__.py +6 -0
- engine/io_contract.py +144 -0
- engine/normalize.py +236 -0
- engine/pipeline_orchestrator.py +275 -0
- engine/registry.py +134 -0
- engine/reporting.py +208 -0
- engine/storage.py +189 -0
- engine/templates/report.html +171 -0
- gui/__init__.py +5 -0
- gui/streamlit_app.py +332 -0
- gui/terminal_panel.py +127 -0
- modules/README.txt +14 -0
- modules/__init__.py +0 -0
- modules/correlation/correlate.py +126 -0
- modules/correlation/correlate_ioc.py +23 -0
- modules/export/export_results.py +104 -0
- modules/ingestion/__init__.py +0 -0
- modules/ingestion/gathering/web_scraper.py +6 -0
- modules/ingestion/ingest_data.py +54 -0
- modules/ml_analysis/ml_analysis.py +119 -0
- modules/preprocessing/preprocess_data.py +75 -0
- pipeline_config.yaml +15 -0
- requirements-hf.txt +17 -0
- samples/demo_ingest/example.csv +2 -0
- samples/demo_ingest/example.html +1 -0
- samples/demo_ingest/example.json +1 -0
- samples/demo_ingest/example.txt +1 -0
- scripts/docker_entrypoint.sh +60 -0
.dockerignore
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Version control
|
| 2 |
+
.git/
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Python caches
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
*.egg-info/
|
| 11 |
+
dist/
|
| 12 |
+
build/
|
| 13 |
+
.eggs/
|
| 14 |
+
|
| 15 |
+
# Virtual environments
|
| 16 |
+
.venv/
|
| 17 |
+
venv/
|
| 18 |
+
env/
|
| 19 |
+
|
| 20 |
+
# Runtime outputs β never bake into image
|
| 21 |
+
runs/
|
| 22 |
+
artifacts/
|
| 23 |
+
logs/
|
| 24 |
+
*.log
|
| 25 |
+
*.sqlite
|
| 26 |
+
*.db
|
| 27 |
+
|
| 28 |
+
# Secrets / env
|
| 29 |
+
*.env
|
| 30 |
+
.env
|
| 31 |
+
.env.*
|
| 32 |
+
Vault/
|
| 33 |
+
|
| 34 |
+
# macOS
|
| 35 |
+
.DS_Store
|
| 36 |
+
.AppleDouble
|
| 37 |
+
|
| 38 |
+
# CI / audit artefacts
|
| 39 |
+
.branch_audit/
|
| 40 |
+
*.csv
|
| 41 |
+
*_output.json
|
| 42 |
+
safety_report.json
|
| 43 |
+
|
| 44 |
+
# Heavy model files
|
| 45 |
+
models/gguf/
|
| 46 |
+
ml_models/
|
| 47 |
+
*.gguf
|
| 48 |
+
*.bin
|
| 49 |
+
*.pt
|
| 50 |
+
*.pth
|
| 51 |
+
|
| 52 |
+
# Dev-only docs / notebooks
|
| 53 |
+
*.ipynb
|
| 54 |
+
*.odt
|
| 55 |
+
*.tex
|
| 56 |
+
*.pdf
|
| 57 |
+
|
| 58 |
+
# Legacy dashboard (not served by container)
|
| 59 |
+
dashboard/
|
.github/workflows/hf_sync.yml
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to HF Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- beta/sanitized-minimal
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
inputs:
|
| 9 |
+
ref:
|
| 10 |
+
description: "Branch or SHA to deploy (default: beta/sanitized-minimal)"
|
| 11 |
+
required: false
|
| 12 |
+
default: "beta/sanitized-minimal"
|
| 13 |
+
|
| 14 |
+
concurrency:
|
| 15 |
+
group: hf-space-sync
|
| 16 |
+
cancel-in-progress: true
|
| 17 |
+
|
| 18 |
+
jobs:
|
| 19 |
+
hf-sync:
|
| 20 |
+
runs-on: ubuntu-latest
|
| 21 |
+
timeout-minutes: 15
|
| 22 |
+
|
| 23 |
+
steps:
|
| 24 |
+
- name: Checkout deployment ref
|
| 25 |
+
uses: actions/checkout@v4
|
| 26 |
+
with:
|
| 27 |
+
fetch-depth: 0
|
| 28 |
+
lfs: true
|
| 29 |
+
ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.ref }}
|
| 30 |
+
|
| 31 |
+
- name: Resolve checked out ref
|
| 32 |
+
id: ref
|
| 33 |
+
run: |
|
| 34 |
+
BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD)
|
| 35 |
+
if [ "$BRANCH_NAME" = "HEAD" ]; then
|
| 36 |
+
BRANCH_NAME=$(git name-rev --name-only --exclude=tags/* HEAD | sed 's#^remotes/origin/##')
|
| 37 |
+
fi
|
| 38 |
+
echo "branch_name=$BRANCH_NAME" >> "$GITHUB_OUTPUT"
|
| 39 |
+
echo "Deploying ref: $BRANCH_NAME @ $(git rev-parse --short HEAD)"
|
| 40 |
+
|
| 41 |
+
- name: Enforce sanitized deployment policy
|
| 42 |
+
run: |
|
| 43 |
+
set -euo pipefail
|
| 44 |
+
|
| 45 |
+
echo "Checking required runtime files..."
|
| 46 |
+
required_files=(
|
| 47 |
+
"README.md"
|
| 48 |
+
"Dockerfile"
|
| 49 |
+
"requirements-hf.txt"
|
| 50 |
+
"gui/streamlit_app.py"
|
| 51 |
+
"engine/pipeline_orchestrator.py"
|
| 52 |
+
"scripts/docker_entrypoint.sh"
|
| 53 |
+
)
|
| 54 |
+
for f in "${required_files[@]}"; do
|
| 55 |
+
[ -f "$f" ] || { echo "::error::Missing required file: $f"; exit 1; }
|
| 56 |
+
done
|
| 57 |
+
|
| 58 |
+
echo "Checking blocked paths/files..."
|
| 59 |
+
blocked_paths=(
|
| 60 |
+
"Docs"
|
| 61 |
+
"Docs/audit"
|
| 62 |
+
"Vault"
|
| 63 |
+
"logs"
|
| 64 |
+
"runs"
|
| 65 |
+
".env"
|
| 66 |
+
"TODO.md"
|
| 67 |
+
"docs/progress.md"
|
| 68 |
+
)
|
| 69 |
+
violations=0
|
| 70 |
+
for p in "${blocked_paths[@]}"; do
|
| 71 |
+
if [ -e "$p" ]; then
|
| 72 |
+
echo "::error::Blocked path present in deployment ref: $p"
|
| 73 |
+
violations=1
|
| 74 |
+
fi
|
| 75 |
+
done
|
| 76 |
+
[ "$violations" -eq 0 ]
|
| 77 |
+
|
| 78 |
+
echo "Checking for obvious secret patterns..."
|
| 79 |
+
if command -v rg >/dev/null 2>&1; then
|
| 80 |
+
rg -n --hidden \
|
| 81 |
+
-g '!.git/**' \
|
| 82 |
+
-g '!**/*.md' \
|
| 83 |
+
-e 'AKIA[0-9A-Z]{16}' \
|
| 84 |
+
-e 'ASIA[0-9A-Z]{16}' \
|
| 85 |
+
-e 'ghp_[A-Za-z0-9]{36}' \
|
| 86 |
+
-e 'hf_[A-Za-z0-9]{30,}' \
|
| 87 |
+
-e 'sk_live_[0-9A-Za-z]{20,}' \
|
| 88 |
+
-e '-----BEGIN (RSA|EC|OPENSSH|DSA) PRIVATE KEY-----' \
|
| 89 |
+
-e 'xox[baprs]-[A-Za-z0-9-]{10,}' \
|
| 90 |
+
. && { echo "::error::Potential secret detected"; exit 1; } || true
|
| 91 |
+
fi
|
| 92 |
+
|
| 93 |
+
- name: Check large files (>10 MB)
|
| 94 |
+
run: |
|
| 95 |
+
set -euo pipefail
|
| 96 |
+
large_files=$(find . -not -path './.git/*' -type f -size +10M 2>/dev/null || true)
|
| 97 |
+
if [ -n "$large_files" ]; then
|
| 98 |
+
echo "::warning::Large files detected (>10MB):"
|
| 99 |
+
echo "$large_files"
|
| 100 |
+
echo "Consider Git LFS for required large assets."
|
| 101 |
+
else
|
| 102 |
+
echo "No large files >10MB found."
|
| 103 |
+
fi
|
| 104 |
+
|
| 105 |
+
- name: Push to HF Space
|
| 106 |
+
env:
|
| 107 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 108 |
+
run: |
|
| 109 |
+
set -euo pipefail
|
| 110 |
+
|
| 111 |
+
if [ -z "${HF_TOKEN:-}" ]; then
|
| 112 |
+
echo "::error::HF_TOKEN secret is not set. Configure it in GitHub Actions secrets."
|
| 113 |
+
exit 1
|
| 114 |
+
fi
|
| 115 |
+
|
| 116 |
+
git config user.email "ci@github.com"
|
| 117 |
+
git config user.name "CI Bot"
|
| 118 |
+
|
| 119 |
+
git remote remove hf 2>/dev/null || true
|
| 120 |
+
git remote add hf "https://moddux:${HF_TOKEN}@huggingface.co/spaces/moddux/mod-osint"
|
| 121 |
+
|
| 122 |
+
echo "Pushing $(git rev-parse --short HEAD) from '${{ steps.ref.outputs.branch_name }}' -> HF main"
|
| 123 |
+
git push hf HEAD:main --force-with-lease
|
| 124 |
+
|
| 125 |
+
echo "HF deploy push complete."
|
| 126 |
+
echo "Logs: https://huggingface.co/spaces/moddux/mod-osint/logs"
|
.gitignore
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ===== Python =====
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.egg-info/
|
| 7 |
+
*.egg
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
*.whl
|
| 11 |
+
|
| 12 |
+
# ===== Virtual environments =====
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
env/
|
| 16 |
+
|
| 17 |
+
# ===== IDE / OS =====
|
| 18 |
+
.git/
|
| 19 |
+
.DS_Store
|
| 20 |
+
._*
|
| 21 |
+
.idea/
|
| 22 |
+
.vscode/
|
| 23 |
+
*.swp
|
| 24 |
+
*.swo
|
| 25 |
+
*~
|
| 26 |
+
|
| 27 |
+
# ===== Environment / secrets =====
|
| 28 |
+
*.env
|
| 29 |
+
.env.*
|
| 30 |
+
|
| 31 |
+
# ===== Logs =====
|
| 32 |
+
*.log
|
| 33 |
+
logs/
|
| 34 |
+
runner/logs/
|
| 35 |
+
|
| 36 |
+
# ===== Databases =====
|
| 37 |
+
*.sqlite
|
| 38 |
+
*.sqlite3
|
| 39 |
+
*.db
|
| 40 |
+
|
| 41 |
+
# ===== Run artifacts (engine output) =====
|
| 42 |
+
runs/
|
| 43 |
+
output/
|
| 44 |
+
*_output.json
|
| 45 |
+
*_output.csv
|
| 46 |
+
|
| 47 |
+
# ===== Legacy generated artifacts =====
|
| 48 |
+
artifacts/*.json
|
| 49 |
+
artifacts/*.csv
|
| 50 |
+
artifacts/*.html
|
| 51 |
+
artifacts/*.svg
|
| 52 |
+
artifacts/*.pdf
|
| 53 |
+
NETLOG*
|
| 54 |
+
DNSESS*
|
| 55 |
+
graph.cypher
|
| 56 |
+
|
| 57 |
+
# ===== Data directory (user-supplied, not tracked) =====
|
| 58 |
+
Data/
|
| 59 |
+
|
| 60 |
+
# ===== ML models (large binaries) =====
|
| 61 |
+
venv/models/gguf/*.gguf
|
| 62 |
+
models/
|
| 63 |
+
*.gguf
|
| 64 |
+
*.bin
|
| 65 |
+
*.safetensors
|
| 66 |
+
|
| 67 |
+
# ===== Nested repo (do not track) =====
|
| 68 |
+
MOD_OSINT/
|
| 69 |
+
|
| 70 |
+
# ===== Legacy uppercase Modules (quarantined, kept on disk) =====
|
| 71 |
+
# NOTE: On case-insensitive filesystems (macOS), we cannot gitignore
|
| 72 |
+
# "Modules/" without also blocking "modules/". The uppercase files
|
| 73 |
+
# were removed from the index in commit 1284d1a. We rely on the
|
| 74 |
+
# index removal rather than .gitignore for Modules/.
|
| 75 |
+
|
| 76 |
+
# ===== Misc generated =====
|
| 77 |
+
pipeline_status.json
|
| 78 |
+
*.zip
|
| 79 |
+
*.tar.gz
|
| 80 |
+
htmlcov/
|
| 81 |
+
.coverage
|
| 82 |
+
.pytest_cache/
|
| 83 |
+
.mypy_cache/
|
| 84 |
+
.ruff_cache/
|
Dockerfile
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MOD-OSINT β Hugging Face Docker Space
|
| 2 |
+
# Runs the GUI wizard wired to engine/pipeline_orchestrator.py on port 7860.
|
| 3 |
+
FROM python:3.11-slim
|
| 4 |
+
|
| 5 |
+
ARG BUILD_DATE="unknown"
|
| 6 |
+
ARG VCS_REF="unknown"
|
| 7 |
+
|
| 8 |
+
LABEL maintainer="moddux" \
|
| 9 |
+
org.opencontainers.image.title="MOD-OSINT" \
|
| 10 |
+
org.opencontainers.image.description="MOD-OSINT Streamlit GUI for HF Docker Space" \
|
| 11 |
+
org.opencontainers.image.source="https://github.com/moddux/MOD-OSINT" \
|
| 12 |
+
org.opencontainers.image.created="${BUILD_DATE}" \
|
| 13 |
+
org.opencontainers.image.revision="${VCS_REF}"
|
| 14 |
+
|
| 15 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 16 |
+
PYTHONUNBUFFERED=1 \
|
| 17 |
+
PIP_NO_CACHE_DIR=1 \
|
| 18 |
+
STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
|
| 19 |
+
STREAMLIT_SERVER_HEADLESS=true \
|
| 20 |
+
STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
|
| 21 |
+
STREAMLIT_SERVER_PORT=7860
|
| 22 |
+
|
| 23 |
+
RUN apt-get update \
|
| 24 |
+
&& apt-get install -y --no-install-recommends \
|
| 25 |
+
ca-certificates \
|
| 26 |
+
git \
|
| 27 |
+
curl \
|
| 28 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 29 |
+
|
| 30 |
+
WORKDIR /app
|
| 31 |
+
|
| 32 |
+
# Install runtime dependencies first for cache reuse.
|
| 33 |
+
COPY requirements-hf.txt ./requirements-hf.txt
|
| 34 |
+
RUN python -m pip install --upgrade pip \
|
| 35 |
+
&& pip install --no-cache-dir -r requirements-hf.txt
|
| 36 |
+
|
| 37 |
+
# Create runtime user before copying app files.
|
| 38 |
+
RUN useradd -m -u 1000 appuser
|
| 39 |
+
|
| 40 |
+
# Copy application source (honors .dockerignore).
|
| 41 |
+
COPY --chown=appuser:appuser . .
|
| 42 |
+
|
| 43 |
+
# Runtime dirs and entrypoint permissions.
|
| 44 |
+
RUN mkdir -p /app/runs /app/logs \
|
| 45 |
+
&& chown -R appuser:appuser /app/runs /app/logs \
|
| 46 |
+
&& chmod 775 /app/runs /app/logs \
|
| 47 |
+
&& chmod +x /app/scripts/docker_entrypoint.sh
|
| 48 |
+
|
| 49 |
+
USER appuser
|
| 50 |
+
|
| 51 |
+
EXPOSE 7860
|
| 52 |
+
|
| 53 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=5 \
|
| 54 |
+
CMD curl -fsS http://127.0.0.1:7860/_stcore/health || exit 1
|
| 55 |
+
|
| 56 |
+
ENTRYPOINT ["bash", "scripts/docker_entrypoint.sh"]
|
LICENSE
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy...
|
README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: mod-osint
|
| 3 |
+
emoji: "π§ "
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# MOD-OSINT (Beta Sanitized Build)
|
| 12 |
+
|
| 13 |
+
This branch is the sanitized beta deployment bundle for:
|
| 14 |
+
|
| 15 |
+
- Hugging Face Space: https://huggingface.co/spaces/moddux/mod-osint
|
| 16 |
+
|
| 17 |
+
What is included:
|
| 18 |
+
|
| 19 |
+
- Runtime engine (`engine/`)
|
| 20 |
+
- Runtime modules (`modules/`)
|
| 21 |
+
- Streamlit GUI (`gui/`)
|
| 22 |
+
- Docker runtime files (`Dockerfile`, `requirements-hf.txt`, `scripts/docker_entrypoint.sh`)
|
| 23 |
+
|
| 24 |
+
What is excluded:
|
| 25 |
+
|
| 26 |
+
- Audit logs and runtime logs
|
| 27 |
+
- Development TODO/progress notes
|
| 28 |
+
- Internal vault/secrets files
|
| 29 |
+
- Non-runtime integrations and large artifacts
|
| 30 |
+
|
| 31 |
+
## Local Run
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
docker build -t mod-osint-beta .
|
| 35 |
+
docker run --rm -p 7860:7860 mod-osint-beta
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Then open: http://localhost:7860
|
engine/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MOD-OSINT Engine β centralized pipeline orchestration.
|
| 3 |
+
|
| 4 |
+
Submodules:
|
| 5 |
+
io_contract β Pydantic v2 typed IO schemas
|
| 6 |
+
registry β module registration + dependency graph
|
| 7 |
+
normalize β canonical schema + entity linking
|
| 8 |
+
storage β SQL storage (SQLite / Postgres)
|
| 9 |
+
reporting β HTML report + JSONL/CSV exports
|
| 10 |
+
pipeline_orchestrator β single entrypoint
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
__version__ = "0.1.0"
|
engine/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Allow ``python -m engine`` to run the pipeline orchestrator."""
|
| 2 |
+
|
| 3 |
+
from engine.pipeline_orchestrator import main
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
main()
|
engine/io_contract.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
IO Contract β typed schemas for the MOD-OSINT engine.
|
| 3 |
+
|
| 4 |
+
Every pipeline module must accept ``EngineInput`` and return ``EngineOutput``.
|
| 5 |
+
The engine orchestrator builds ``RunContext`` which carries paths and DB handles.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import uuid
|
| 11 |
+
from datetime import datetime, timezone
|
| 12 |
+
from enum import Enum
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Any, Dict, List, Optional
|
| 15 |
+
|
| 16 |
+
from pydantic import BaseModel, Field
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Enums
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
class FileType(str, Enum):
|
| 24 |
+
CSV = "csv"
|
| 25 |
+
JSON = "json"
|
| 26 |
+
TXT = "txt"
|
| 27 |
+
HTML = "html"
|
| 28 |
+
LOG = "log"
|
| 29 |
+
UNKNOWN = "unknown"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class StageStatus(str, Enum):
|
| 33 |
+
PENDING = "pending"
|
| 34 |
+
RUNNING = "running"
|
| 35 |
+
SUCCESS = "success"
|
| 36 |
+
FAILED = "failed"
|
| 37 |
+
SKIPPED = "skipped"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Input specs
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
class InputFile(BaseModel):
|
| 45 |
+
"""Descriptor for a single ingested file."""
|
| 46 |
+
path: Path
|
| 47 |
+
file_type: FileType = FileType.UNKNOWN
|
| 48 |
+
size_bytes: int = 0
|
| 49 |
+
sha256: str = ""
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class InputSpec(BaseModel):
|
| 53 |
+
"""Describes the full set of input data for a pipeline run."""
|
| 54 |
+
input_dir: Path
|
| 55 |
+
files: List[InputFile] = Field(default_factory=list)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Normalized records
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
class NormalizedRecord(BaseModel):
|
| 63 |
+
"""
|
| 64 |
+
Canonical record produced by the normalization stage.
|
| 65 |
+
|
| 66 |
+
Every record gets a deterministic ``row_id`` and optional entity-linking
|
| 67 |
+
keys so downstream modules can join/correlate across sources.
|
| 68 |
+
"""
|
| 69 |
+
row_id: str = Field(default_factory=lambda: uuid.uuid4().hex[:12])
|
| 70 |
+
source_file: str = ""
|
| 71 |
+
source_type: FileType = FileType.UNKNOWN
|
| 72 |
+
timestamp: Optional[datetime] = None
|
| 73 |
+
entity_name: Optional[str] = None
|
| 74 |
+
entity_phone: Optional[str] = None
|
| 75 |
+
entity_email: Optional[str] = None
|
| 76 |
+
entity_ip: Optional[str] = None
|
| 77 |
+
entity_domain: Optional[str] = None
|
| 78 |
+
entity_hash: Optional[str] = None
|
| 79 |
+
raw_text: str = ""
|
| 80 |
+
extra: Dict[str, Any] = Field(default_factory=dict)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
# Artifacts
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
class Artifact(BaseModel):
|
| 88 |
+
"""A single output artifact produced by a module."""
|
| 89 |
+
name: str
|
| 90 |
+
path: Path
|
| 91 |
+
mime_type: str = "application/octet-stream"
|
| 92 |
+
description: str = ""
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Engine IO (module contract)
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
|
| 99 |
+
class EngineInput(BaseModel):
|
| 100 |
+
"""
|
| 101 |
+
Standard input passed to every pipeline module's ``run()`` function.
|
| 102 |
+
|
| 103 |
+
Modules receive the normalized records from prior stages plus the
|
| 104 |
+
run context (paths, config).
|
| 105 |
+
"""
|
| 106 |
+
run_id: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6])
|
| 107 |
+
input_spec: InputSpec
|
| 108 |
+
records: List[NormalizedRecord] = Field(default_factory=list)
|
| 109 |
+
config: Dict[str, Any] = Field(default_factory=dict)
|
| 110 |
+
run_dir: Path = Path("runs/default")
|
| 111 |
+
previous_artifacts: List[Artifact] = Field(default_factory=list)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class EngineOutput(BaseModel):
|
| 115 |
+
"""
|
| 116 |
+
Standard output returned by every pipeline module's ``run()`` function.
|
| 117 |
+
"""
|
| 118 |
+
stage: str
|
| 119 |
+
status: StageStatus = StageStatus.SUCCESS
|
| 120 |
+
records: List[NormalizedRecord] = Field(default_factory=list)
|
| 121 |
+
artifacts: List[Artifact] = Field(default_factory=list)
|
| 122 |
+
summary: str = ""
|
| 123 |
+
error: Optional[str] = None
|
| 124 |
+
metadata: Dict[str, Any] = Field(default_factory=dict)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# Run context (internal, built by orchestrator)
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
class RunContext(BaseModel):
|
| 132 |
+
"""
|
| 133 |
+
Internal context object built by the orchestrator for a single run.
|
| 134 |
+
Carries the run directory, DB path, and accumulated state.
|
| 135 |
+
"""
|
| 136 |
+
run_id: str
|
| 137 |
+
run_dir: Path
|
| 138 |
+
db_path: Path
|
| 139 |
+
input_spec: InputSpec
|
| 140 |
+
config: Dict[str, Any] = Field(default_factory=dict)
|
| 141 |
+
stage_results: Dict[str, EngineOutput] = Field(default_factory=dict)
|
| 142 |
+
|
| 143 |
+
class Config:
|
| 144 |
+
arbitrary_types_allowed = True
|
engine/normalize.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Normalize β canonical schema builder + entity linking keys.
|
| 3 |
+
|
| 4 |
+
Reads raw files from the input directory, detects file types, parses
|
| 5 |
+
them into ``NormalizedRecord`` instances with deterministic row IDs
|
| 6 |
+
and entity-linking fields.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import csv
|
| 12 |
+
import hashlib
|
| 13 |
+
import io
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
import re
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Any, Dict, List, Optional
|
| 19 |
+
|
| 20 |
+
from engine.io_contract import FileType, InputFile, InputSpec, NormalizedRecord
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger("engine.normalize")
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# File-type detection
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
_EXT_MAP: Dict[str, FileType] = {
|
| 29 |
+
".csv": FileType.CSV,
|
| 30 |
+
".json": FileType.JSON,
|
| 31 |
+
".txt": FileType.TXT,
|
| 32 |
+
".html": FileType.HTML,
|
| 33 |
+
".htm": FileType.HTML,
|
| 34 |
+
".log": FileType.LOG,
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def detect_file_type(path: Path) -> FileType:
|
| 39 |
+
"""Detect file type from extension."""
|
| 40 |
+
return _EXT_MAP.get(path.suffix.lower(), FileType.UNKNOWN)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _file_sha256(path: Path) -> str:
|
| 44 |
+
"""Compute SHA-256 hex digest of a file."""
|
| 45 |
+
h = hashlib.sha256()
|
| 46 |
+
with open(path, "rb") as f:
|
| 47 |
+
for chunk in iter(lambda: f.read(8192), b""):
|
| 48 |
+
h.update(chunk)
|
| 49 |
+
return h.hexdigest()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
# Build InputSpec from a directory
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
def build_input_spec(input_dir: Path) -> InputSpec:
|
| 57 |
+
"""
|
| 58 |
+
Scan *input_dir* for supported files and return an ``InputSpec``.
|
| 59 |
+
"""
|
| 60 |
+
input_dir = Path(input_dir)
|
| 61 |
+
files: List[InputFile] = []
|
| 62 |
+
if input_dir.is_file():
|
| 63 |
+
# Single file mode
|
| 64 |
+
ft = detect_file_type(input_dir)
|
| 65 |
+
files.append(InputFile(
|
| 66 |
+
path=input_dir,
|
| 67 |
+
file_type=ft,
|
| 68 |
+
size_bytes=input_dir.stat().st_size,
|
| 69 |
+
sha256=_file_sha256(input_dir),
|
| 70 |
+
))
|
| 71 |
+
input_dir = input_dir.parent
|
| 72 |
+
else:
|
| 73 |
+
for p in sorted(input_dir.iterdir()):
|
| 74 |
+
if p.is_file() and not p.name.startswith("."):
|
| 75 |
+
ft = detect_file_type(p)
|
| 76 |
+
files.append(InputFile(
|
| 77 |
+
path=p,
|
| 78 |
+
file_type=ft,
|
| 79 |
+
size_bytes=p.stat().st_size,
|
| 80 |
+
sha256=_file_sha256(p),
|
| 81 |
+
))
|
| 82 |
+
logger.info("InputSpec: %d files from %s", len(files), input_dir)
|
| 83 |
+
return InputSpec(input_dir=input_dir, files=files)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Deterministic row ID
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
def _make_row_id(source_file: str, index: int, content_hash: str) -> str:
|
| 91 |
+
"""
|
| 92 |
+
Deterministic row ID = first 12 hex chars of SHA-256(source_file + index + content).
|
| 93 |
+
"""
|
| 94 |
+
raw = f"{source_file}:{index}:{content_hash}"
|
| 95 |
+
return hashlib.sha256(raw.encode()).hexdigest()[:12]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# Entity extraction helpers (lightweight, no ML)
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
|
| 103 |
+
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
|
| 104 |
+
_PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
|
| 105 |
+
_DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
|
| 106 |
+
_HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _extract_entities(text: str) -> Dict[str, Optional[str]]:
|
| 110 |
+
"""Extract first occurrence of common entity types from text."""
|
| 111 |
+
email_m = _EMAIL_RE.search(text)
|
| 112 |
+
ip_m = _IP_RE.search(text)
|
| 113 |
+
phone_m = _PHONE_RE.search(text)
|
| 114 |
+
domain_m = _DOMAIN_RE.search(text)
|
| 115 |
+
hash_m = _HASH_RE.search(text)
|
| 116 |
+
return {
|
| 117 |
+
"entity_email": email_m.group(0) if email_m else None,
|
| 118 |
+
"entity_ip": ip_m.group(0) if ip_m else None,
|
| 119 |
+
"entity_phone": phone_m.group(0) if phone_m else None,
|
| 120 |
+
"entity_domain": domain_m.group(0) if domain_m else None,
|
| 121 |
+
"entity_hash": hash_m.group(0) if hash_m else None,
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
# Parsers per file type
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
|
| 129 |
+
def _parse_csv(path: Path) -> List[NormalizedRecord]:
|
| 130 |
+
records: List[NormalizedRecord] = []
|
| 131 |
+
with open(path, newline="", encoding="utf-8", errors="replace") as f:
|
| 132 |
+
reader = csv.DictReader(f)
|
| 133 |
+
for idx, row in enumerate(reader):
|
| 134 |
+
text = json.dumps(row, ensure_ascii=False)
|
| 135 |
+
content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
|
| 136 |
+
entities = _extract_entities(text)
|
| 137 |
+
# Try to pick up common column names
|
| 138 |
+
name = row.get("name") or row.get("Name") or row.get("entity_name")
|
| 139 |
+
phone = row.get("phone") or row.get("Phone") or row.get("entity_phone")
|
| 140 |
+
email = row.get("email") or row.get("Email") or row.get("entity_email")
|
| 141 |
+
records.append(NormalizedRecord(
|
| 142 |
+
row_id=_make_row_id(str(path), idx, content_hash),
|
| 143 |
+
source_file=str(path.name),
|
| 144 |
+
source_type=FileType.CSV,
|
| 145 |
+
entity_name=name or entities.get("entity_name"),
|
| 146 |
+
entity_phone=phone or entities.get("entity_phone"),
|
| 147 |
+
entity_email=email or entities.get("entity_email"),
|
| 148 |
+
entity_ip=entities.get("entity_ip"),
|
| 149 |
+
entity_domain=entities.get("entity_domain"),
|
| 150 |
+
entity_hash=entities.get("entity_hash"),
|
| 151 |
+
raw_text=text,
|
| 152 |
+
extra=dict(row),
|
| 153 |
+
))
|
| 154 |
+
return records
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _parse_json(path: Path) -> List[NormalizedRecord]:
|
| 158 |
+
records: List[NormalizedRecord] = []
|
| 159 |
+
with open(path, encoding="utf-8", errors="replace") as f:
|
| 160 |
+
data = json.load(f)
|
| 161 |
+
# Handle both single object and list of objects
|
| 162 |
+
items = data if isinstance(data, list) else [data]
|
| 163 |
+
for idx, item in enumerate(items):
|
| 164 |
+
text = json.dumps(item, ensure_ascii=False) if isinstance(item, dict) else str(item)
|
| 165 |
+
content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
|
| 166 |
+
entities = _extract_entities(text)
|
| 167 |
+
extra = item if isinstance(item, dict) else {"value": item}
|
| 168 |
+
records.append(NormalizedRecord(
|
| 169 |
+
row_id=_make_row_id(str(path), idx, content_hash),
|
| 170 |
+
source_file=str(path.name),
|
| 171 |
+
source_type=FileType.JSON,
|
| 172 |
+
entity_name=extra.get("name") if isinstance(extra, dict) else None,
|
| 173 |
+
entity_email=entities.get("entity_email"),
|
| 174 |
+
entity_ip=entities.get("entity_ip"),
|
| 175 |
+
entity_phone=entities.get("entity_phone"),
|
| 176 |
+
entity_domain=entities.get("entity_domain"),
|
| 177 |
+
entity_hash=entities.get("entity_hash"),
|
| 178 |
+
raw_text=text,
|
| 179 |
+
extra=extra,
|
| 180 |
+
))
|
| 181 |
+
return records
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _parse_text(path: Path, file_type: FileType = FileType.TXT) -> List[NormalizedRecord]:
|
| 185 |
+
"""Parse plain text / HTML / log files β one record per non-empty line."""
|
| 186 |
+
records: List[NormalizedRecord] = []
|
| 187 |
+
with open(path, encoding="utf-8", errors="replace") as f:
|
| 188 |
+
lines = f.readlines()
|
| 189 |
+
for idx, line in enumerate(lines):
|
| 190 |
+
line = line.strip()
|
| 191 |
+
if not line:
|
| 192 |
+
continue
|
| 193 |
+
content_hash = hashlib.sha256(line.encode()).hexdigest()[:16]
|
| 194 |
+
entities = _extract_entities(line)
|
| 195 |
+
records.append(NormalizedRecord(
|
| 196 |
+
row_id=_make_row_id(str(path), idx, content_hash),
|
| 197 |
+
source_file=str(path.name),
|
| 198 |
+
source_type=file_type,
|
| 199 |
+
entity_email=entities.get("entity_email"),
|
| 200 |
+
entity_ip=entities.get("entity_ip"),
|
| 201 |
+
entity_phone=entities.get("entity_phone"),
|
| 202 |
+
entity_domain=entities.get("entity_domain"),
|
| 203 |
+
entity_hash=entities.get("entity_hash"),
|
| 204 |
+
raw_text=line,
|
| 205 |
+
))
|
| 206 |
+
return records
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ---------------------------------------------------------------------------
|
| 210 |
+
# Main normalization entry point
|
| 211 |
+
# ---------------------------------------------------------------------------
|
| 212 |
+
|
| 213 |
+
def normalize_files(input_spec: InputSpec) -> List[NormalizedRecord]:
|
| 214 |
+
"""
|
| 215 |
+
Parse all files in *input_spec* and return a flat list of
|
| 216 |
+
``NormalizedRecord`` instances.
|
| 217 |
+
"""
|
| 218 |
+
all_records: List[NormalizedRecord] = []
|
| 219 |
+
for f in input_spec.files:
|
| 220 |
+
try:
|
| 221 |
+
if f.file_type == FileType.CSV:
|
| 222 |
+
recs = _parse_csv(f.path)
|
| 223 |
+
elif f.file_type == FileType.JSON:
|
| 224 |
+
recs = _parse_json(f.path)
|
| 225 |
+
elif f.file_type in (FileType.TXT, FileType.LOG):
|
| 226 |
+
recs = _parse_text(f.path, f.file_type)
|
| 227 |
+
elif f.file_type == FileType.HTML:
|
| 228 |
+
recs = _parse_text(f.path, FileType.HTML)
|
| 229 |
+
else:
|
| 230 |
+
recs = _parse_text(f.path, FileType.UNKNOWN)
|
| 231 |
+
logger.info("Parsed %d records from %s (%s)", len(recs), f.path.name, f.file_type.value)
|
| 232 |
+
all_records.extend(recs)
|
| 233 |
+
except Exception as exc:
|
| 234 |
+
logger.error("Failed to parse %s: %s", f.path, exc)
|
| 235 |
+
logger.info("Total normalized records: %d", len(all_records))
|
| 236 |
+
return all_records
|
engine/pipeline_orchestrator.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pipeline Orchestrator β single entrypoint for MOD-OSINT pipeline execution.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python -m engine.pipeline_orchestrator --input samples/demo_ingest/
|
| 6 |
+
|
| 7 |
+
Flow:
|
| 8 |
+
1. Build InputSpec from --input path
|
| 9 |
+
2. Normalize files β NormalizedRecords
|
| 10 |
+
3. Discover & register pipeline modules
|
| 11 |
+
4. Run each stage in order: ingestion β preprocessing β analysis β correlation β export
|
| 12 |
+
5. Store records + stage results in SQLite (or Postgres)
|
| 13 |
+
6. Generate HTML report + JSONL/CSV exports
|
| 14 |
+
7. Write run manifest
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import json
|
| 21 |
+
import logging
|
| 22 |
+
import sys
|
| 23 |
+
import uuid
|
| 24 |
+
from datetime import datetime, timezone
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import Dict, List
|
| 27 |
+
|
| 28 |
+
from engine.io_contract import (
|
| 29 |
+
Artifact,
|
| 30 |
+
EngineInput,
|
| 31 |
+
EngineOutput,
|
| 32 |
+
InputSpec,
|
| 33 |
+
NormalizedRecord,
|
| 34 |
+
RunContext,
|
| 35 |
+
StageStatus,
|
| 36 |
+
)
|
| 37 |
+
from engine.normalize import build_input_spec, normalize_files
|
| 38 |
+
from engine.registry import discover_and_register, get_ordered_stages, get_stage
|
| 39 |
+
from engine.reporting import generate_report
|
| 40 |
+
from engine.storage import StorageBackend, create_storage
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# Logging setup
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
logging.basicConfig(
|
| 47 |
+
level=logging.INFO,
|
| 48 |
+
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
| 49 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
| 50 |
+
)
|
| 51 |
+
logger = logging.getLogger("engine.orchestrator")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
# Run directory
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
|
| 58 |
+
def _create_run_dir(base: Path = Path("runs")) -> tuple[str, Path]:
|
| 59 |
+
"""Create a deterministic run directory and return (run_id, run_dir)."""
|
| 60 |
+
run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6]
|
| 61 |
+
run_dir = base / run_id
|
| 62 |
+
run_dir.mkdir(parents=True, exist_ok=True)
|
| 63 |
+
return run_id, run_dir
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
# Core pipeline execution
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
|
| 70 |
+
def run_pipeline(
|
| 71 |
+
input_path: str | Path,
|
| 72 |
+
config: Dict | None = None,
|
| 73 |
+
runs_base: Path = Path("runs"),
|
| 74 |
+
) -> RunContext:
|
| 75 |
+
"""
|
| 76 |
+
Execute the full pipeline.
|
| 77 |
+
|
| 78 |
+
Args:
|
| 79 |
+
input_path: Path to input directory or single file.
|
| 80 |
+
config: Optional config overrides.
|
| 81 |
+
runs_base: Base directory for run outputs.
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
RunContext with all results.
|
| 85 |
+
"""
|
| 86 |
+
config = config or {}
|
| 87 |
+
|
| 88 |
+
# 1. Create run directory
|
| 89 |
+
run_id, run_dir = _create_run_dir(runs_base)
|
| 90 |
+
logger.info("βββ Pipeline run %s βββ", run_id)
|
| 91 |
+
logger.info("Run directory: %s", run_dir)
|
| 92 |
+
|
| 93 |
+
# 2. Build input spec
|
| 94 |
+
input_path = Path(input_path)
|
| 95 |
+
if not input_path.exists():
|
| 96 |
+
logger.error("Input path does not exist: %s", input_path)
|
| 97 |
+
sys.exit(1)
|
| 98 |
+
input_spec = build_input_spec(input_path)
|
| 99 |
+
logger.info("Input: %d files from %s", len(input_spec.files), input_spec.input_dir)
|
| 100 |
+
|
| 101 |
+
# 3. Normalize files
|
| 102 |
+
records = normalize_files(input_spec)
|
| 103 |
+
logger.info("Normalized: %d records", len(records))
|
| 104 |
+
|
| 105 |
+
# 4. Set up storage
|
| 106 |
+
db_path = run_dir / "db.sqlite"
|
| 107 |
+
storage = create_storage(db_path)
|
| 108 |
+
|
| 109 |
+
# 5. Store initial normalized records
|
| 110 |
+
storage.insert_records(records)
|
| 111 |
+
|
| 112 |
+
# 6. Build run context
|
| 113 |
+
ctx = RunContext(
|
| 114 |
+
run_id=run_id,
|
| 115 |
+
run_dir=run_dir,
|
| 116 |
+
db_path=db_path,
|
| 117 |
+
input_spec=input_spec,
|
| 118 |
+
config=config,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
# 7. Discover and register modules
|
| 122 |
+
registered = discover_and_register()
|
| 123 |
+
logger.info("Registered stages: %s", registered)
|
| 124 |
+
|
| 125 |
+
# 8. Run each stage
|
| 126 |
+
stage_outputs: Dict[str, EngineOutput] = {}
|
| 127 |
+
current_records = records
|
| 128 |
+
all_artifacts: List[Artifact] = []
|
| 129 |
+
|
| 130 |
+
for stage_name in get_ordered_stages():
|
| 131 |
+
logger.info("ββ Stage: %s ββ", stage_name)
|
| 132 |
+
run_fn = get_stage(stage_name)
|
| 133 |
+
if run_fn is None:
|
| 134 |
+
logger.warning("No run function for stage '%s' β skipping", stage_name)
|
| 135 |
+
stage_outputs[stage_name] = EngineOutput(
|
| 136 |
+
stage=stage_name,
|
| 137 |
+
status=StageStatus.SKIPPED,
|
| 138 |
+
summary="No run function found",
|
| 139 |
+
)
|
| 140 |
+
continue
|
| 141 |
+
|
| 142 |
+
# Build stage input
|
| 143 |
+
engine_input = EngineInput(
|
| 144 |
+
run_id=run_id,
|
| 145 |
+
input_spec=input_spec,
|
| 146 |
+
records=current_records,
|
| 147 |
+
config=config,
|
| 148 |
+
run_dir=run_dir,
|
| 149 |
+
previous_artifacts=all_artifacts,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
output = run_fn(engine_input)
|
| 154 |
+
if not isinstance(output, EngineOutput):
|
| 155 |
+
# Wrap legacy return values
|
| 156 |
+
output = EngineOutput(
|
| 157 |
+
stage=stage_name,
|
| 158 |
+
status=StageStatus.SUCCESS,
|
| 159 |
+
records=current_records,
|
| 160 |
+
summary=str(output) if output else "completed",
|
| 161 |
+
)
|
| 162 |
+
# Update records if the stage produced new ones
|
| 163 |
+
if output.records:
|
| 164 |
+
current_records = output.records
|
| 165 |
+
stage_outputs[stage_name] = output
|
| 166 |
+
all_artifacts.extend(output.artifacts)
|
| 167 |
+
logger.info(" β %s: %s", stage_name, output.status.value)
|
| 168 |
+
if output.summary:
|
| 169 |
+
logger.info(" %s", output.summary)
|
| 170 |
+
except Exception as exc:
|
| 171 |
+
logger.error(" β %s failed: %s", stage_name, exc, exc_info=True)
|
| 172 |
+
stage_outputs[stage_name] = EngineOutput(
|
| 173 |
+
stage=stage_name,
|
| 174 |
+
status=StageStatus.FAILED,
|
| 175 |
+
error=str(exc),
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
# Persist stage result
|
| 179 |
+
storage.insert_stage_result(stage_outputs[stage_name])
|
| 180 |
+
|
| 181 |
+
# 9. Store final records (may have been enriched by stages)
|
| 182 |
+
storage.insert_records(current_records)
|
| 183 |
+
|
| 184 |
+
# 10. Generate report
|
| 185 |
+
logger.info("ββ Generating report ββ")
|
| 186 |
+
report_artifacts = generate_report(
|
| 187 |
+
run_id=run_id,
|
| 188 |
+
run_dir=run_dir,
|
| 189 |
+
input_spec=input_spec,
|
| 190 |
+
records=current_records,
|
| 191 |
+
stage_outputs=stage_outputs,
|
| 192 |
+
)
|
| 193 |
+
for a in report_artifacts:
|
| 194 |
+
storage.insert_artifact(a)
|
| 195 |
+
all_artifacts.extend(report_artifacts)
|
| 196 |
+
|
| 197 |
+
# 11. Write run manifest
|
| 198 |
+
manifest = {
|
| 199 |
+
"run_id": run_id,
|
| 200 |
+
"run_dir": str(run_dir),
|
| 201 |
+
"input_path": str(input_path),
|
| 202 |
+
"total_records": len(current_records),
|
| 203 |
+
"stages": {
|
| 204 |
+
name: {"status": out.status.value, "summary": out.summary}
|
| 205 |
+
for name, out in stage_outputs.items()
|
| 206 |
+
},
|
| 207 |
+
"artifacts": [
|
| 208 |
+
{"name": a.name, "path": str(a.path)}
|
| 209 |
+
for a in all_artifacts
|
| 210 |
+
],
|
| 211 |
+
"completed_at": datetime.now(timezone.utc).isoformat(),
|
| 212 |
+
}
|
| 213 |
+
manifest_path = run_dir / "manifest.json"
|
| 214 |
+
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
|
| 215 |
+
logger.info("Manifest: %s", manifest_path)
|
| 216 |
+
|
| 217 |
+
# 12. Close storage
|
| 218 |
+
storage.close()
|
| 219 |
+
|
| 220 |
+
# 13. Summary
|
| 221 |
+
logger.info("βββ Pipeline complete βββ")
|
| 222 |
+
logger.info(" Run ID: %s", run_id)
|
| 223 |
+
logger.info(" Records: %d", len(current_records))
|
| 224 |
+
logger.info(" DB: %s", db_path)
|
| 225 |
+
logger.info(" Report: %s", run_dir / "report" / "index.html")
|
| 226 |
+
logger.info(" Exports: %s", run_dir / "exports")
|
| 227 |
+
|
| 228 |
+
ctx.stage_results = stage_outputs
|
| 229 |
+
return ctx
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ---------------------------------------------------------------------------
|
| 233 |
+
# CLI entrypoint
|
| 234 |
+
# ---------------------------------------------------------------------------
|
| 235 |
+
|
| 236 |
+
def main() -> None:
|
| 237 |
+
parser = argparse.ArgumentParser(
|
| 238 |
+
description="MOD-OSINT Pipeline Orchestrator",
|
| 239 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 240 |
+
epilog="""
|
| 241 |
+
Examples:
|
| 242 |
+
python -m engine.pipeline_orchestrator --input samples/demo_ingest/
|
| 243 |
+
python -m engine.pipeline_orchestrator --input data/case_001.csv
|
| 244 |
+
""",
|
| 245 |
+
)
|
| 246 |
+
parser.add_argument(
|
| 247 |
+
"--input", "-i",
|
| 248 |
+
required=True,
|
| 249 |
+
help="Path to input directory or single file",
|
| 250 |
+
)
|
| 251 |
+
parser.add_argument(
|
| 252 |
+
"--config", "-c",
|
| 253 |
+
default=None,
|
| 254 |
+
help="Path to JSON config overrides",
|
| 255 |
+
)
|
| 256 |
+
parser.add_argument(
|
| 257 |
+
"--runs-dir",
|
| 258 |
+
default="runs",
|
| 259 |
+
help="Base directory for run outputs (default: runs/)",
|
| 260 |
+
)
|
| 261 |
+
args = parser.parse_args()
|
| 262 |
+
|
| 263 |
+
config = {}
|
| 264 |
+
if args.config:
|
| 265 |
+
config = json.loads(Path(args.config).read_text())
|
| 266 |
+
|
| 267 |
+
run_pipeline(
|
| 268 |
+
input_path=args.input,
|
| 269 |
+
config=config,
|
| 270 |
+
runs_base=Path(args.runs_dir),
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
if __name__ == "__main__":
|
| 275 |
+
main()
|
engine/registry.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Module Registry β discovery, registration, and dependency ordering.
|
| 3 |
+
|
| 4 |
+
The engine discovers pipeline modules from ``modules/`` (lowercase only).
|
| 5 |
+
Each module that wants to participate in the pipeline must either:
|
| 6 |
+
|
| 7 |
+
1. Be listed in ``PIPELINE_STAGES`` (the built-in ordered list), or
|
| 8 |
+
2. Register itself via the ``@pipeline_stage`` decorator.
|
| 9 |
+
|
| 10 |
+
The registry enforces that every registered module exposes a
|
| 11 |
+
``run(input: EngineInput) -> EngineOutput`` callable.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import importlib
|
| 17 |
+
import logging
|
| 18 |
+
from collections import OrderedDict
|
| 19 |
+
from typing import Callable, Dict, List, Optional, Protocol
|
| 20 |
+
|
| 21 |
+
from engine.io_contract import EngineInput, EngineOutput
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger("engine.registry")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
# Protocol that every pipeline module must satisfy
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
class PipelineModule(Protocol):
|
| 31 |
+
"""Structural type for a pipeline module."""
|
| 32 |
+
|
| 33 |
+
def run(self, engine_input: EngineInput) -> EngineOutput: ...
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Built-in pipeline stage ordering
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
PIPELINE_STAGES: List[Dict[str, str]] = [
|
| 41 |
+
{"name": "ingestion", "module_path": "modules.ingestion.ingest_data"},
|
| 42 |
+
{"name": "preprocessing", "module_path": "modules.preprocessing.preprocess_data"},
|
| 43 |
+
{"name": "analysis", "module_path": "modules.ml_analysis.ml_analysis"},
|
| 44 |
+
{"name": "correlation", "module_path": "modules.correlation.correlate"},
|
| 45 |
+
{"name": "export", "module_path": "modules.export.export_results"},
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Registry singleton
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
_registry: OrderedDict[str, Callable[[EngineInput], EngineOutput]] = OrderedDict()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def register(name: str, run_fn: Callable[[EngineInput], EngineOutput]) -> None:
|
| 57 |
+
"""Register a module's run function under *name*."""
|
| 58 |
+
if name in _registry:
|
| 59 |
+
logger.warning("Overwriting existing registration for stage '%s'", name)
|
| 60 |
+
_registry[name] = run_fn
|
| 61 |
+
logger.info("Registered pipeline stage: %s", name)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def get_stage(name: str) -> Optional[Callable[[EngineInput], EngineOutput]]:
|
| 65 |
+
"""Return the run function for *name*, or ``None``."""
|
| 66 |
+
return _registry.get(name)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_ordered_stages() -> List[str]:
|
| 70 |
+
"""Return stage names in pipeline execution order."""
|
| 71 |
+
return list(_registry.keys())
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def clear() -> None:
|
| 75 |
+
"""Clear all registrations (useful for testing)."""
|
| 76 |
+
_registry.clear()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ---------------------------------------------------------------------------
|
| 80 |
+
# Decorator for ad-hoc registration
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
|
| 83 |
+
def pipeline_stage(name: str):
|
| 84 |
+
"""
|
| 85 |
+
Decorator to register a function as a pipeline stage::
|
| 86 |
+
|
| 87 |
+
@pipeline_stage("my_stage")
|
| 88 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 89 |
+
...
|
| 90 |
+
"""
|
| 91 |
+
def decorator(fn: Callable[[EngineInput], EngineOutput]):
|
| 92 |
+
register(name, fn)
|
| 93 |
+
return fn
|
| 94 |
+
return decorator
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
# Auto-discovery from PIPELINE_STAGES
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
|
| 101 |
+
def discover_and_register() -> List[str]:
|
| 102 |
+
"""
|
| 103 |
+
Import each module listed in ``PIPELINE_STAGES`` and register its
|
| 104 |
+
``run`` function. Returns the list of successfully registered stage names.
|
| 105 |
+
|
| 106 |
+
Only discovers from ``modules/`` (lowercase). The uppercase ``Modules/``
|
| 107 |
+
directory is explicitly excluded.
|
| 108 |
+
"""
|
| 109 |
+
registered: List[str] = []
|
| 110 |
+
for stage_def in PIPELINE_STAGES:
|
| 111 |
+
name = stage_def["name"]
|
| 112 |
+
module_path = stage_def["module_path"]
|
| 113 |
+
try:
|
| 114 |
+
mod = importlib.import_module(module_path)
|
| 115 |
+
run_fn = getattr(mod, "run", None)
|
| 116 |
+
if run_fn is None:
|
| 117 |
+
logger.error(
|
| 118 |
+
"Module '%s' (%s) has no run() function β skipping",
|
| 119 |
+
name, module_path,
|
| 120 |
+
)
|
| 121 |
+
continue
|
| 122 |
+
register(name, run_fn)
|
| 123 |
+
registered.append(name)
|
| 124 |
+
except ImportError as exc:
|
| 125 |
+
logger.error(
|
| 126 |
+
"Failed to import module '%s' (%s): %s",
|
| 127 |
+
name, module_path, exc,
|
| 128 |
+
)
|
| 129 |
+
except Exception as exc:
|
| 130 |
+
logger.error(
|
| 131 |
+
"Unexpected error loading '%s' (%s): %s",
|
| 132 |
+
name, module_path, exc,
|
| 133 |
+
)
|
| 134 |
+
return registered
|
engine/reporting.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reporting β HTML report generation + JSONL/CSV exports.
|
| 3 |
+
|
| 4 |
+
Produces:
|
| 5 |
+
runs/<run_id>/report/index.html β human-browsable report
|
| 6 |
+
runs/<run_id>/exports/normalized.jsonl
|
| 7 |
+
runs/<run_id>/exports/records.csv
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import csv
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any, Dict, List
|
| 18 |
+
|
| 19 |
+
from engine.io_contract import (
|
| 20 |
+
Artifact,
|
| 21 |
+
EngineOutput,
|
| 22 |
+
InputSpec,
|
| 23 |
+
NormalizedRecord,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger("engine.reporting")
|
| 27 |
+
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
# Jinja2 setup (lazy import so the module can be imported without jinja2)
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
|
| 32 |
+
_TEMPLATE_DIR = Path(__file__).parent / "templates"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _render_html(template_name: str, context: Dict[str, Any]) -> str:
|
| 36 |
+
"""Render a Jinja2 template from the engine/templates/ directory."""
|
| 37 |
+
try:
|
| 38 |
+
from jinja2 import Environment, FileSystemLoader
|
| 39 |
+
except ImportError:
|
| 40 |
+
logger.warning("jinja2 not installed β HTML report will be a plain summary")
|
| 41 |
+
return _fallback_html(context)
|
| 42 |
+
|
| 43 |
+
env = Environment(
|
| 44 |
+
loader=FileSystemLoader(str(_TEMPLATE_DIR)),
|
| 45 |
+
autoescape=True,
|
| 46 |
+
)
|
| 47 |
+
template = env.get_template(template_name)
|
| 48 |
+
return template.render(**context)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _fallback_html(context: Dict[str, Any]) -> str:
|
| 52 |
+
"""Minimal HTML when Jinja2 is unavailable."""
|
| 53 |
+
return (
|
| 54 |
+
f"<html><body><h1>MOD-OSINT Report β {context.get('run_id', '?')}</h1>"
|
| 55 |
+
f"<p>Records: {context.get('total_records', 0)}</p>"
|
| 56 |
+
f"<p>Generated: {context.get('generated_at', '')}</p>"
|
| 57 |
+
f"<p><em>Install jinja2 for the full HTML report.</em></p>"
|
| 58 |
+
f"</body></html>"
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Export helpers
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path:
|
| 67 |
+
"""Write records as newline-delimited JSON."""
|
| 68 |
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
| 69 |
+
with open(out_path, "w", encoding="utf-8") as f:
|
| 70 |
+
for r in records:
|
| 71 |
+
f.write(r.model_dump_json() + "\n")
|
| 72 |
+
logger.info("Exported %d records to %s", len(records), out_path)
|
| 73 |
+
return out_path
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path:
|
| 77 |
+
"""Write records as CSV."""
|
| 78 |
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
| 79 |
+
if not records:
|
| 80 |
+
out_path.write_text("")
|
| 81 |
+
return out_path
|
| 82 |
+
|
| 83 |
+
fieldnames = list(records[0].model_dump().keys())
|
| 84 |
+
with open(out_path, "w", newline="", encoding="utf-8") as f:
|
| 85 |
+
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
| 86 |
+
writer.writeheader()
|
| 87 |
+
for r in records:
|
| 88 |
+
row = r.model_dump()
|
| 89 |
+
# Serialize complex fields
|
| 90 |
+
for k, v in row.items():
|
| 91 |
+
if isinstance(v, (dict, list)):
|
| 92 |
+
row[k] = json.dumps(v, ensure_ascii=False, default=str)
|
| 93 |
+
elif isinstance(v, Path):
|
| 94 |
+
row[k] = str(v)
|
| 95 |
+
elif v is None:
|
| 96 |
+
row[k] = ""
|
| 97 |
+
writer.writerow(row)
|
| 98 |
+
logger.info("Exported %d records to %s", len(records), out_path)
|
| 99 |
+
return out_path
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# Main report generation
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
def generate_report(
|
| 107 |
+
run_id: str,
|
| 108 |
+
run_dir: Path,
|
| 109 |
+
input_spec: InputSpec,
|
| 110 |
+
records: List[NormalizedRecord],
|
| 111 |
+
stage_outputs: Dict[str, EngineOutput],
|
| 112 |
+
) -> List[Artifact]:
|
| 113 |
+
"""
|
| 114 |
+
Generate the full report suite:
|
| 115 |
+
- HTML report at ``run_dir/report/index.html``
|
| 116 |
+
- JSONL export at ``run_dir/exports/normalized.jsonl``
|
| 117 |
+
- CSV export at ``run_dir/exports/records.csv``
|
| 118 |
+
|
| 119 |
+
Returns a list of ``Artifact`` objects.
|
| 120 |
+
"""
|
| 121 |
+
report_dir = run_dir / "report"
|
| 122 |
+
exports_dir = run_dir / "exports"
|
| 123 |
+
report_dir.mkdir(parents=True, exist_ok=True)
|
| 124 |
+
exports_dir.mkdir(parents=True, exist_ok=True)
|
| 125 |
+
|
| 126 |
+
artifacts: List[Artifact] = []
|
| 127 |
+
|
| 128 |
+
# -- JSONL export --------------------------------------------------------
|
| 129 |
+
jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl")
|
| 130 |
+
artifacts.append(Artifact(
|
| 131 |
+
name="normalized.jsonl",
|
| 132 |
+
path=jsonl_path,
|
| 133 |
+
mime_type="application/jsonl",
|
| 134 |
+
description="All normalized records in JSONL format",
|
| 135 |
+
))
|
| 136 |
+
|
| 137 |
+
# -- CSV export ----------------------------------------------------------
|
| 138 |
+
csv_path = export_csv(records, exports_dir / "records.csv")
|
| 139 |
+
artifacts.append(Artifact(
|
| 140 |
+
name="records.csv",
|
| 141 |
+
path=csv_path,
|
| 142 |
+
mime_type="text/csv",
|
| 143 |
+
description="All normalized records in CSV format",
|
| 144 |
+
))
|
| 145 |
+
|
| 146 |
+
# -- HTML report ---------------------------------------------------------
|
| 147 |
+
preview_limit = 50
|
| 148 |
+
stages_data = []
|
| 149 |
+
for name, out in stage_outputs.items():
|
| 150 |
+
stages_data.append({
|
| 151 |
+
"stage": name,
|
| 152 |
+
"status": out.status.value,
|
| 153 |
+
"summary": out.summary,
|
| 154 |
+
"error": out.error,
|
| 155 |
+
})
|
| 156 |
+
|
| 157 |
+
input_files_data = []
|
| 158 |
+
for f in input_spec.files:
|
| 159 |
+
input_files_data.append({
|
| 160 |
+
"name": f.path.name,
|
| 161 |
+
"file_type": f.file_type.value,
|
| 162 |
+
"size_bytes": f.size_bytes,
|
| 163 |
+
"sha256": f.sha256,
|
| 164 |
+
})
|
| 165 |
+
|
| 166 |
+
records_preview = []
|
| 167 |
+
for r in records[:preview_limit]:
|
| 168 |
+
d = r.model_dump()
|
| 169 |
+
# Convert Path/enum to string for template
|
| 170 |
+
d["source_type"] = d.get("source_type", "")
|
| 171 |
+
if hasattr(d["source_type"], "value"):
|
| 172 |
+
d["source_type"] = d["source_type"].value
|
| 173 |
+
records_preview.append(d)
|
| 174 |
+
|
| 175 |
+
# Build relative paths for download links
|
| 176 |
+
artifacts_data = []
|
| 177 |
+
for a in artifacts:
|
| 178 |
+
try:
|
| 179 |
+
rel = a.path.relative_to(report_dir)
|
| 180 |
+
except ValueError:
|
| 181 |
+
rel = Path("..") / "exports" / a.path.name
|
| 182 |
+
artifacts_data.append({"name": a.name, "rel_path": str(rel)})
|
| 183 |
+
|
| 184 |
+
context = {
|
| 185 |
+
"run_id": run_id,
|
| 186 |
+
"generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
|
| 187 |
+
"total_records": len(records),
|
| 188 |
+
"input_file_count": len(input_spec.files),
|
| 189 |
+
"stages": stages_data,
|
| 190 |
+
"input_files": input_files_data,
|
| 191 |
+
"records_preview": records_preview,
|
| 192 |
+
"preview_limit": preview_limit,
|
| 193 |
+
"artifacts": artifacts_data,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
html_content = _render_html("report.html", context)
|
| 197 |
+
html_path = report_dir / "index.html"
|
| 198 |
+
html_path.write_text(html_content, encoding="utf-8")
|
| 199 |
+
logger.info("HTML report written to %s", html_path)
|
| 200 |
+
|
| 201 |
+
artifacts.append(Artifact(
|
| 202 |
+
name="index.html",
|
| 203 |
+
path=html_path,
|
| 204 |
+
mime_type="text/html",
|
| 205 |
+
description="Human-browsable pipeline report",
|
| 206 |
+
))
|
| 207 |
+
|
| 208 |
+
return artifacts
|
engine/storage.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Storage β SQL persistence for pipeline runs.
|
| 3 |
+
|
| 4 |
+
Uses SQLite by default (``runs/<run_id>/db.sqlite``).
|
| 5 |
+
Set ``DATABASE_URL`` env var for Postgres (e.g. ``postgresql://user:pass@host/db``).
|
| 6 |
+
|
| 7 |
+
Tables:
|
| 8 |
+
normalized_records β all NormalizedRecord rows
|
| 9 |
+
stage_results β per-stage summary + status
|
| 10 |
+
artifacts β artifact metadata
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import logging
|
| 17 |
+
import os
|
| 18 |
+
import sqlite3
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import List, Optional
|
| 21 |
+
|
| 22 |
+
from engine.io_contract import Artifact, EngineOutput, NormalizedRecord
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger("engine.storage")
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
# Schema DDL (SQLite-compatible, works with Postgres too)
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
_DDL = """
|
| 31 |
+
CREATE TABLE IF NOT EXISTS normalized_records (
|
| 32 |
+
row_id TEXT PRIMARY KEY,
|
| 33 |
+
source_file TEXT,
|
| 34 |
+
source_type TEXT,
|
| 35 |
+
timestamp TEXT,
|
| 36 |
+
entity_name TEXT,
|
| 37 |
+
entity_phone TEXT,
|
| 38 |
+
entity_email TEXT,
|
| 39 |
+
entity_ip TEXT,
|
| 40 |
+
entity_domain TEXT,
|
| 41 |
+
entity_hash TEXT,
|
| 42 |
+
raw_text TEXT,
|
| 43 |
+
extra TEXT
|
| 44 |
+
);
|
| 45 |
+
|
| 46 |
+
CREATE TABLE IF NOT EXISTS stage_results (
|
| 47 |
+
stage TEXT PRIMARY KEY,
|
| 48 |
+
status TEXT,
|
| 49 |
+
summary TEXT,
|
| 50 |
+
error TEXT,
|
| 51 |
+
metadata TEXT
|
| 52 |
+
);
|
| 53 |
+
|
| 54 |
+
CREATE TABLE IF NOT EXISTS artifacts (
|
| 55 |
+
name TEXT PRIMARY KEY,
|
| 56 |
+
path TEXT,
|
| 57 |
+
mime_type TEXT,
|
| 58 |
+
description TEXT
|
| 59 |
+
);
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
# Storage backend
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
|
| 67 |
+
class StorageBackend:
|
| 68 |
+
"""
|
| 69 |
+
Thin wrapper around a SQLite (or Postgres) connection.
|
| 70 |
+
|
| 71 |
+
For this iteration we use raw ``sqlite3``. A future iteration can
|
| 72 |
+
swap in SQLAlchemy / SQLModel for Postgres parity.
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
def __init__(self, db_path: Path):
|
| 76 |
+
self.db_path = db_path
|
| 77 |
+
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
| 78 |
+
self._conn: Optional[sqlite3.Connection] = None
|
| 79 |
+
|
| 80 |
+
# -- lifecycle -----------------------------------------------------------
|
| 81 |
+
|
| 82 |
+
def connect(self) -> None:
|
| 83 |
+
logger.info("Connecting to SQLite: %s", self.db_path)
|
| 84 |
+
self._conn = sqlite3.connect(str(self.db_path))
|
| 85 |
+
self._conn.executescript(_DDL)
|
| 86 |
+
self._conn.commit()
|
| 87 |
+
|
| 88 |
+
def close(self) -> None:
|
| 89 |
+
if self._conn:
|
| 90 |
+
self._conn.close()
|
| 91 |
+
self._conn = None
|
| 92 |
+
|
| 93 |
+
@property
|
| 94 |
+
def conn(self) -> sqlite3.Connection:
|
| 95 |
+
if self._conn is None:
|
| 96 |
+
self.connect()
|
| 97 |
+
assert self._conn is not None
|
| 98 |
+
return self._conn
|
| 99 |
+
|
| 100 |
+
# -- writes --------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
def insert_records(self, records: List[NormalizedRecord]) -> int:
|
| 103 |
+
"""Insert normalized records. Returns count inserted."""
|
| 104 |
+
if not records:
|
| 105 |
+
return 0
|
| 106 |
+
sql = """
|
| 107 |
+
INSERT OR REPLACE INTO normalized_records
|
| 108 |
+
(row_id, source_file, source_type, timestamp,
|
| 109 |
+
entity_name, entity_phone, entity_email,
|
| 110 |
+
entity_ip, entity_domain, entity_hash,
|
| 111 |
+
raw_text, extra)
|
| 112 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 113 |
+
"""
|
| 114 |
+
rows = [
|
| 115 |
+
(
|
| 116 |
+
r.row_id,
|
| 117 |
+
r.source_file,
|
| 118 |
+
r.source_type.value if r.source_type else "",
|
| 119 |
+
r.timestamp.isoformat() if r.timestamp else None,
|
| 120 |
+
r.entity_name,
|
| 121 |
+
r.entity_phone,
|
| 122 |
+
r.entity_email,
|
| 123 |
+
r.entity_ip,
|
| 124 |
+
r.entity_domain,
|
| 125 |
+
r.entity_hash,
|
| 126 |
+
r.raw_text,
|
| 127 |
+
json.dumps(r.extra, ensure_ascii=False, default=str),
|
| 128 |
+
)
|
| 129 |
+
for r in records
|
| 130 |
+
]
|
| 131 |
+
self.conn.executemany(sql, rows)
|
| 132 |
+
self.conn.commit()
|
| 133 |
+
logger.info("Inserted %d normalized records", len(rows))
|
| 134 |
+
return len(rows)
|
| 135 |
+
|
| 136 |
+
def insert_stage_result(self, output: EngineOutput) -> None:
|
| 137 |
+
"""Upsert a stage result row."""
|
| 138 |
+
sql = """
|
| 139 |
+
INSERT OR REPLACE INTO stage_results
|
| 140 |
+
(stage, status, summary, error, metadata)
|
| 141 |
+
VALUES (?, ?, ?, ?, ?)
|
| 142 |
+
"""
|
| 143 |
+
self.conn.execute(sql, (
|
| 144 |
+
output.stage,
|
| 145 |
+
output.status.value,
|
| 146 |
+
output.summary,
|
| 147 |
+
output.error,
|
| 148 |
+
json.dumps(output.metadata, ensure_ascii=False, default=str),
|
| 149 |
+
))
|
| 150 |
+
self.conn.commit()
|
| 151 |
+
|
| 152 |
+
def insert_artifact(self, artifact: Artifact) -> None:
|
| 153 |
+
"""Upsert an artifact metadata row."""
|
| 154 |
+
sql = """
|
| 155 |
+
INSERT OR REPLACE INTO artifacts
|
| 156 |
+
(name, path, mime_type, description)
|
| 157 |
+
VALUES (?, ?, ?, ?)
|
| 158 |
+
"""
|
| 159 |
+
self.conn.execute(sql, (
|
| 160 |
+
artifact.name,
|
| 161 |
+
str(artifact.path),
|
| 162 |
+
artifact.mime_type,
|
| 163 |
+
artifact.description,
|
| 164 |
+
))
|
| 165 |
+
self.conn.commit()
|
| 166 |
+
|
| 167 |
+
# -- reads ---------------------------------------------------------------
|
| 168 |
+
|
| 169 |
+
def count_records(self) -> int:
|
| 170 |
+
cur = self.conn.execute("SELECT COUNT(*) FROM normalized_records")
|
| 171 |
+
return cur.fetchone()[0]
|
| 172 |
+
|
| 173 |
+
def fetch_all_records(self) -> List[dict]:
|
| 174 |
+
"""Return all normalized records as dicts."""
|
| 175 |
+
cur = self.conn.execute("SELECT * FROM normalized_records")
|
| 176 |
+
cols = [d[0] for d in cur.description]
|
| 177 |
+
return [dict(zip(cols, row)) for row in cur.fetchall()]
|
| 178 |
+
|
| 179 |
+
def fetch_stage_results(self) -> List[dict]:
|
| 180 |
+
cur = self.conn.execute("SELECT * FROM stage_results")
|
| 181 |
+
cols = [d[0] for d in cur.description]
|
| 182 |
+
return [dict(zip(cols, row)) for row in cur.fetchall()]
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def create_storage(db_path: Path) -> StorageBackend:
|
| 186 |
+
"""Factory: create and connect a StorageBackend."""
|
| 187 |
+
backend = StorageBackend(db_path)
|
| 188 |
+
backend.connect()
|
| 189 |
+
return backend
|
engine/templates/report.html
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>MOD-OSINT Report β {{ run_id }}</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg: #0d1117;
|
| 10 |
+
--surface: #161b22;
|
| 11 |
+
--border: #30363d;
|
| 12 |
+
--text: #c9d1d9;
|
| 13 |
+
--accent: #58a6ff;
|
| 14 |
+
--success: #3fb950;
|
| 15 |
+
--error: #f85149;
|
| 16 |
+
--warn: #d29922;
|
| 17 |
+
}
|
| 18 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 19 |
+
body {
|
| 20 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
|
| 21 |
+
background: var(--bg);
|
| 22 |
+
color: var(--text);
|
| 23 |
+
padding: 2rem;
|
| 24 |
+
line-height: 1.6;
|
| 25 |
+
}
|
| 26 |
+
h1, h2, h3 { color: var(--accent); margin-bottom: 0.5rem; }
|
| 27 |
+
h1 { font-size: 1.8rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
|
| 28 |
+
h2 { font-size: 1.3rem; margin-top: 2rem; }
|
| 29 |
+
.meta { color: #8b949e; font-size: 0.9rem; margin-bottom: 1.5rem; }
|
| 30 |
+
.card {
|
| 31 |
+
background: var(--surface);
|
| 32 |
+
border: 1px solid var(--border);
|
| 33 |
+
border-radius: 6px;
|
| 34 |
+
padding: 1rem 1.2rem;
|
| 35 |
+
margin-bottom: 1rem;
|
| 36 |
+
}
|
| 37 |
+
.badge {
|
| 38 |
+
display: inline-block;
|
| 39 |
+
padding: 2px 8px;
|
| 40 |
+
border-radius: 12px;
|
| 41 |
+
font-size: 0.8rem;
|
| 42 |
+
font-weight: 600;
|
| 43 |
+
}
|
| 44 |
+
.badge-success { background: var(--success); color: #000; }
|
| 45 |
+
.badge-failed { background: var(--error); color: #fff; }
|
| 46 |
+
.badge-skipped { background: var(--warn); color: #000; }
|
| 47 |
+
table {
|
| 48 |
+
width: 100%;
|
| 49 |
+
border-collapse: collapse;
|
| 50 |
+
margin-top: 0.5rem;
|
| 51 |
+
font-size: 0.85rem;
|
| 52 |
+
}
|
| 53 |
+
th, td {
|
| 54 |
+
text-align: left;
|
| 55 |
+
padding: 6px 10px;
|
| 56 |
+
border-bottom: 1px solid var(--border);
|
| 57 |
+
}
|
| 58 |
+
th { color: var(--accent); font-weight: 600; }
|
| 59 |
+
tr:hover { background: rgba(88,166,255,0.05); }
|
| 60 |
+
.truncate { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
| 61 |
+
a { color: var(--accent); text-decoration: none; }
|
| 62 |
+
a:hover { text-decoration: underline; }
|
| 63 |
+
.downloads { display: flex; gap: 1rem; flex-wrap: wrap; margin-top: 1rem; }
|
| 64 |
+
.dl-btn {
|
| 65 |
+
display: inline-block;
|
| 66 |
+
padding: 8px 16px;
|
| 67 |
+
background: var(--accent);
|
| 68 |
+
color: #000;
|
| 69 |
+
border-radius: 6px;
|
| 70 |
+
font-weight: 600;
|
| 71 |
+
font-size: 0.85rem;
|
| 72 |
+
}
|
| 73 |
+
.dl-btn:hover { opacity: 0.85; text-decoration: none; }
|
| 74 |
+
</style>
|
| 75 |
+
</head>
|
| 76 |
+
<body>
|
| 77 |
+
<h1>π΅οΈ MOD-OSINT Pipeline Report</h1>
|
| 78 |
+
<p class="meta">
|
| 79 |
+
Run ID: <strong>{{ run_id }}</strong> |
|
| 80 |
+
Generated: <strong>{{ generated_at }}</strong> |
|
| 81 |
+
Records: <strong>{{ total_records }}</strong> |
|
| 82 |
+
Input files: <strong>{{ input_file_count }}</strong>
|
| 83 |
+
</p>
|
| 84 |
+
|
| 85 |
+
<!-- Stage Results -->
|
| 86 |
+
<h2>Pipeline Stages</h2>
|
| 87 |
+
{% for stage in stages %}
|
| 88 |
+
<div class="card">
|
| 89 |
+
<strong>{{ stage.stage }}</strong>
|
| 90 |
+
{% if stage.status == "success" %}
|
| 91 |
+
<span class="badge badge-success">β success</span>
|
| 92 |
+
{% elif stage.status == "failed" %}
|
| 93 |
+
<span class="badge badge-failed">β failed</span>
|
| 94 |
+
{% else %}
|
| 95 |
+
<span class="badge badge-skipped">β {{ stage.status }}</span>
|
| 96 |
+
{% endif %}
|
| 97 |
+
{% if stage.summary %}
|
| 98 |
+
<p style="margin-top:0.4rem; font-size:0.9rem;">{{ stage.summary }}</p>
|
| 99 |
+
{% endif %}
|
| 100 |
+
{% if stage.error %}
|
| 101 |
+
<p style="margin-top:0.4rem; color:var(--error); font-size:0.85rem;">Error: {{ stage.error }}</p>
|
| 102 |
+
{% endif %}
|
| 103 |
+
</div>
|
| 104 |
+
{% endfor %}
|
| 105 |
+
|
| 106 |
+
<!-- Input Files -->
|
| 107 |
+
<h2>Input Files</h2>
|
| 108 |
+
<div class="card">
|
| 109 |
+
<table>
|
| 110 |
+
<thead><tr><th>File</th><th>Type</th><th>Size</th><th>SHA-256</th></tr></thead>
|
| 111 |
+
<tbody>
|
| 112 |
+
{% for f in input_files %}
|
| 113 |
+
<tr>
|
| 114 |
+
<td>{{ f.name }}</td>
|
| 115 |
+
<td>{{ f.file_type }}</td>
|
| 116 |
+
<td>{{ f.size_bytes }} B</td>
|
| 117 |
+
<td class="truncate" title="{{ f.sha256 }}">{{ f.sha256[:16] }}β¦</td>
|
| 118 |
+
</tr>
|
| 119 |
+
{% endfor %}
|
| 120 |
+
</tbody>
|
| 121 |
+
</table>
|
| 122 |
+
</div>
|
| 123 |
+
|
| 124 |
+
<!-- Records Preview -->
|
| 125 |
+
<h2>Normalized Records (first {{ preview_limit }})</h2>
|
| 126 |
+
<div class="card" style="overflow-x:auto;">
|
| 127 |
+
<table>
|
| 128 |
+
<thead>
|
| 129 |
+
<tr>
|
| 130 |
+
<th>row_id</th>
|
| 131 |
+
<th>source</th>
|
| 132 |
+
<th>type</th>
|
| 133 |
+
<th>name</th>
|
| 134 |
+
<th>phone</th>
|
| 135 |
+
<th>email</th>
|
| 136 |
+
<th>ip</th>
|
| 137 |
+
<th>domain</th>
|
| 138 |
+
<th>raw_text</th>
|
| 139 |
+
</tr>
|
| 140 |
+
</thead>
|
| 141 |
+
<tbody>
|
| 142 |
+
{% for r in records_preview %}
|
| 143 |
+
<tr>
|
| 144 |
+
<td><code>{{ r.row_id }}</code></td>
|
| 145 |
+
<td>{{ r.source_file }}</td>
|
| 146 |
+
<td>{{ r.source_type }}</td>
|
| 147 |
+
<td>{{ r.entity_name or '' }}</td>
|
| 148 |
+
<td>{{ r.entity_phone or '' }}</td>
|
| 149 |
+
<td>{{ r.entity_email or '' }}</td>
|
| 150 |
+
<td>{{ r.entity_ip or '' }}</td>
|
| 151 |
+
<td>{{ r.entity_domain or '' }}</td>
|
| 152 |
+
<td class="truncate" title="{{ r.raw_text }}">{{ r.raw_text[:80] }}</td>
|
| 153 |
+
</tr>
|
| 154 |
+
{% endfor %}
|
| 155 |
+
</tbody>
|
| 156 |
+
</table>
|
| 157 |
+
</div>
|
| 158 |
+
|
| 159 |
+
<!-- Downloads -->
|
| 160 |
+
<h2>Downloadable Artifacts</h2>
|
| 161 |
+
<div class="downloads">
|
| 162 |
+
{% for a in artifacts %}
|
| 163 |
+
<a class="dl-btn" href="{{ a.rel_path }}" download>β¬ {{ a.name }}</a>
|
| 164 |
+
{% endfor %}
|
| 165 |
+
</div>
|
| 166 |
+
|
| 167 |
+
<p class="meta" style="margin-top:3rem;">
|
| 168 |
+
Generated by <strong>MOD-OSINT Engine v0.1.0</strong>
|
| 169 |
+
</p>
|
| 170 |
+
</body>
|
| 171 |
+
</html>
|
gui/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MOD-OSINT GUI package.
|
| 3 |
+
|
| 4 |
+
Streamlit entrypoint: gui/streamlit_app.py
|
| 5 |
+
"""
|
gui/streamlit_app.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MOD-OSINT Streamlit GUI Wizard
|
| 3 |
+
Wired to engine.pipeline_orchestrator.run_pipeline()
|
| 4 |
+
|
| 5 |
+
Stages:
|
| 6 |
+
A β Upload / Input selection
|
| 7 |
+
B β Settings
|
| 8 |
+
C β Run pipeline
|
| 9 |
+
D β Browse / Export results
|
| 10 |
+
|
| 11 |
+
Import safety:
|
| 12 |
+
This module avoids importing Streamlit at module load time so CI/tests can
|
| 13 |
+
import it without ScriptRunContext warnings.
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import sqlite3
|
| 18 |
+
import tempfile
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import pandas as pd
|
| 22 |
+
|
| 23 |
+
_DEMO_DIR = Path("samples/demo_ingest")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _load_yaml_defaults(path: Path) -> dict:
|
| 27 |
+
try:
|
| 28 |
+
import yaml # optional; provided by requirements-hf.txt
|
| 29 |
+
return yaml.safe_load(path.read_text()) or {}
|
| 30 |
+
except Exception:
|
| 31 |
+
return {}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _write_uploads(uploads) -> Path:
|
| 35 |
+
"""Save uploaded files into a temp dir and return the dir path."""
|
| 36 |
+
tmp = Path(tempfile.mkdtemp(prefix="modosint_"))
|
| 37 |
+
updir = tmp / "uploads"
|
| 38 |
+
updir.mkdir(parents=True, exist_ok=True)
|
| 39 |
+
for file_obj in uploads:
|
| 40 |
+
(updir / file_obj.name).write_bytes(file_obj.getbuffer())
|
| 41 |
+
return updir
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _resolve_input(session_state) -> Path | None:
|
| 45 |
+
"""Determine input from session state (uploads > local path > demo)."""
|
| 46 |
+
uploads = session_state.get("_uploads")
|
| 47 |
+
if uploads:
|
| 48 |
+
return _write_uploads(uploads)
|
| 49 |
+
|
| 50 |
+
local_path = session_state.get("_local_path", "").strip()
|
| 51 |
+
if local_path:
|
| 52 |
+
path_obj = Path(local_path).expanduser()
|
| 53 |
+
if path_obj.exists():
|
| 54 |
+
return path_obj
|
| 55 |
+
|
| 56 |
+
if session_state.get("_use_demo") and _DEMO_DIR.exists():
|
| 57 |
+
return _DEMO_DIR
|
| 58 |
+
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main() -> None:
|
| 63 |
+
"""Entrypoint for `streamlit run gui/streamlit_app.py`."""
|
| 64 |
+
import streamlit as st
|
| 65 |
+
import streamlit.components.v1 as components
|
| 66 |
+
|
| 67 |
+
from engine.pipeline_orchestrator import run_pipeline
|
| 68 |
+
from gui.terminal_panel import render_terminal
|
| 69 |
+
|
| 70 |
+
st.set_page_config(
|
| 71 |
+
page_title="MOD-OSINT",
|
| 72 |
+
page_icon="π§ ",
|
| 73 |
+
layout="wide",
|
| 74 |
+
initial_sidebar_state="expanded",
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
st.title("π§ MOD-OSINT")
|
| 78 |
+
st.caption("GUI wizard -> `engine.pipeline_orchestrator.run_pipeline()`")
|
| 79 |
+
|
| 80 |
+
if "effective_config" not in st.session_state:
|
| 81 |
+
st.session_state["effective_config"] = {}
|
| 82 |
+
if "last_run_id" not in st.session_state:
|
| 83 |
+
st.session_state["last_run_id"] = None
|
| 84 |
+
if "last_run_dir" not in st.session_state:
|
| 85 |
+
st.session_state["last_run_dir"] = None
|
| 86 |
+
|
| 87 |
+
with st.sidebar:
|
| 88 |
+
render_terminal({"effective_config": st.session_state["effective_config"]})
|
| 89 |
+
|
| 90 |
+
tab_upload, tab_settings, tab_run, tab_browse = st.tabs(
|
| 91 |
+
["π Upload", "βοΈ Settings", "βΆοΈ Run", "π Browse"]
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
with tab_upload:
|
| 95 |
+
st.subheader("A) Upload or select input")
|
| 96 |
+
|
| 97 |
+
uploads = st.file_uploader(
|
| 98 |
+
"Upload files (CSV, JSON, TXT, HTML, LOG)",
|
| 99 |
+
accept_multiple_files=True,
|
| 100 |
+
key="_uploads",
|
| 101 |
+
)
|
| 102 |
+
if uploads:
|
| 103 |
+
st.success(f"Queued {len(uploads)} file(s): {[u.name for u in uploads]}")
|
| 104 |
+
|
| 105 |
+
st.divider()
|
| 106 |
+
|
| 107 |
+
local_path = st.text_input(
|
| 108 |
+
"Or enter a local directory / file path",
|
| 109 |
+
value="",
|
| 110 |
+
key="_local_path",
|
| 111 |
+
placeholder="/path/to/data/",
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
st.divider()
|
| 115 |
+
|
| 116 |
+
st.checkbox(
|
| 117 |
+
f"Use built-in demo dataset (`{_DEMO_DIR}`)",
|
| 118 |
+
value=not bool(uploads) and not bool(local_path),
|
| 119 |
+
key="_use_demo",
|
| 120 |
+
disabled=not _DEMO_DIR.exists(),
|
| 121 |
+
help="Runs the pipeline against samples/demo_ingest/ for quick smoke testing.",
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
if _DEMO_DIR.exists():
|
| 125 |
+
demo_files = sorted(_DEMO_DIR.iterdir())
|
| 126 |
+
st.caption(f"Demo files: {[f.name for f in demo_files if f.is_file()]}")
|
| 127 |
+
else:
|
| 128 |
+
st.caption("`samples/demo_ingest/` not found in working directory.")
|
| 129 |
+
|
| 130 |
+
with tab_settings:
|
| 131 |
+
st.subheader("B) Pipeline settings")
|
| 132 |
+
|
| 133 |
+
cfg_path = Path("pipeline_config.yaml")
|
| 134 |
+
defaults = _load_yaml_defaults(cfg_path) if cfg_path.exists() else {}
|
| 135 |
+
|
| 136 |
+
col_left, col_right = st.columns(2)
|
| 137 |
+
with col_left:
|
| 138 |
+
offline_mode = st.toggle(
|
| 139 |
+
"offline_mode",
|
| 140 |
+
value=True,
|
| 141 |
+
help="Disable all outbound network calls.",
|
| 142 |
+
)
|
| 143 |
+
enable_ml = st.toggle(
|
| 144 |
+
"enable_ml_analysis",
|
| 145 |
+
value=False,
|
| 146 |
+
help="Enable ML/NLP stage (requires torch; off by default).",
|
| 147 |
+
)
|
| 148 |
+
with col_right:
|
| 149 |
+
correlation_mode = st.selectbox(
|
| 150 |
+
"correlation_mode",
|
| 151 |
+
["basic", "in-memory"],
|
| 152 |
+
index=0,
|
| 153 |
+
help="basic = simple entity matching; in-memory = graph in RAM.",
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
effective_config: dict = defaults.copy()
|
| 157 |
+
effective_config.setdefault("runtime", {})
|
| 158 |
+
effective_config["runtime"].update(
|
| 159 |
+
{
|
| 160 |
+
"offline_mode": offline_mode,
|
| 161 |
+
"enable_ml_analysis": enable_ml,
|
| 162 |
+
"correlation_mode": correlation_mode,
|
| 163 |
+
}
|
| 164 |
+
)
|
| 165 |
+
st.session_state["effective_config"] = effective_config
|
| 166 |
+
|
| 167 |
+
st.markdown("**Effective config (passed to engine):**")
|
| 168 |
+
st.json(effective_config)
|
| 169 |
+
|
| 170 |
+
with tab_run:
|
| 171 |
+
st.subheader("C) Run pipeline")
|
| 172 |
+
st.caption("Outputs are written to `runs/<run_id>/`.")
|
| 173 |
+
|
| 174 |
+
input_path = _resolve_input(st.session_state)
|
| 175 |
+
if input_path:
|
| 176 |
+
st.info(f"Input resolved -> `{input_path}`")
|
| 177 |
+
else:
|
| 178 |
+
st.warning("No input selected. Go to Upload tab or enable demo dataset.")
|
| 179 |
+
|
| 180 |
+
run_btn = st.button("π Run pipeline now", type="primary", disabled=input_path is None)
|
| 181 |
+
|
| 182 |
+
if run_btn and input_path:
|
| 183 |
+
progress = st.progress(0, text="Starting...")
|
| 184 |
+
log_area = st.empty()
|
| 185 |
+
log_lines: list[str] = []
|
| 186 |
+
|
| 187 |
+
def _log(message: str) -> None:
|
| 188 |
+
log_lines.append(message)
|
| 189 |
+
log_area.code("\n".join(log_lines[-40:]), language="bash")
|
| 190 |
+
|
| 191 |
+
_log(f"Input: {input_path}")
|
| 192 |
+
_log("Calling engine.pipeline_orchestrator.run_pipeline()...")
|
| 193 |
+
progress.progress(10, text="Normalizing files...")
|
| 194 |
+
|
| 195 |
+
try:
|
| 196 |
+
ctx = run_pipeline(
|
| 197 |
+
input_path=input_path,
|
| 198 |
+
config=st.session_state["effective_config"],
|
| 199 |
+
)
|
| 200 |
+
st.session_state["last_run_id"] = ctx.run_id
|
| 201 |
+
st.session_state["last_run_dir"] = str(ctx.run_dir)
|
| 202 |
+
|
| 203 |
+
progress.progress(90, text="Generating report...")
|
| 204 |
+
_log(f"Run ID: {ctx.run_id}")
|
| 205 |
+
_log(f"Run dir: {ctx.run_dir}")
|
| 206 |
+
|
| 207 |
+
if ctx.stage_results:
|
| 208 |
+
for stage_name, stage_out in ctx.stage_results.items():
|
| 209 |
+
_log(f" [{stage_out.status.value.upper():8s}] {stage_name}")
|
| 210 |
+
|
| 211 |
+
progress.progress(100, text="Done")
|
| 212 |
+
st.success(f"Pipeline complete - run `{ctx.run_id}`")
|
| 213 |
+
st.code(str(ctx.run_dir))
|
| 214 |
+
st.info("Switch to Browse tab to explore outputs.")
|
| 215 |
+
|
| 216 |
+
except Exception as exc:
|
| 217 |
+
progress.empty()
|
| 218 |
+
st.error(f"Pipeline failed: {exc}")
|
| 219 |
+
_log(f"ERROR: {exc}")
|
| 220 |
+
|
| 221 |
+
with tab_browse:
|
| 222 |
+
st.subheader("D) Browse results")
|
| 223 |
+
|
| 224 |
+
run_dir_str = st.session_state.get("last_run_dir")
|
| 225 |
+
if not run_dir_str:
|
| 226 |
+
st.info("Run the pipeline first (Stage C).")
|
| 227 |
+
return
|
| 228 |
+
|
| 229 |
+
run_dir = Path(run_dir_str)
|
| 230 |
+
report_html = run_dir / "report" / "index.html"
|
| 231 |
+
db_path = run_dir / "db.sqlite"
|
| 232 |
+
exports_dir = run_dir / "exports"
|
| 233 |
+
manifest_path = run_dir / "manifest.json"
|
| 234 |
+
|
| 235 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 236 |
+
col1.metric("Run ID", st.session_state.get("last_run_id", "-"))
|
| 237 |
+
col2.metric("Report", "yes" if report_html.exists() else "no")
|
| 238 |
+
col3.metric("DB", "yes" if db_path.exists() else "no")
|
| 239 |
+
col4.metric("Exports", str(len(list(exports_dir.rglob("*"))) if exports_dir.exists() else 0))
|
| 240 |
+
|
| 241 |
+
if manifest_path.exists():
|
| 242 |
+
with st.expander("Run manifest"):
|
| 243 |
+
import json
|
| 244 |
+
st.json(json.loads(manifest_path.read_text()))
|
| 245 |
+
|
| 246 |
+
st.divider()
|
| 247 |
+
|
| 248 |
+
st.markdown("### HTML Report")
|
| 249 |
+
if report_html.exists():
|
| 250 |
+
st.markdown(f"`{report_html}`")
|
| 251 |
+
try:
|
| 252 |
+
components.html(report_html.read_text(errors="replace"), height=700, scrolling=True)
|
| 253 |
+
except Exception as exc:
|
| 254 |
+
st.warning(f"Inline render failed ({exc}). Open the path above in a browser.")
|
| 255 |
+
|
| 256 |
+
with open(report_html, "rb") as file_handle:
|
| 257 |
+
st.download_button(
|
| 258 |
+
"Download report/index.html",
|
| 259 |
+
data=file_handle,
|
| 260 |
+
file_name="index.html",
|
| 261 |
+
mime="text/html",
|
| 262 |
+
)
|
| 263 |
+
else:
|
| 264 |
+
st.info("No report/index.html yet.")
|
| 265 |
+
|
| 266 |
+
st.divider()
|
| 267 |
+
|
| 268 |
+
st.markdown("### Exports")
|
| 269 |
+
if exports_dir.exists():
|
| 270 |
+
export_files = sorted([path for path in exports_dir.rglob("*") if path.is_file()])
|
| 271 |
+
if export_files:
|
| 272 |
+
for export_file in export_files:
|
| 273 |
+
rel = export_file.relative_to(run_dir).as_posix()
|
| 274 |
+
col_path, col_download = st.columns([3, 1])
|
| 275 |
+
col_path.write(f"`{rel}`")
|
| 276 |
+
with open(export_file, "rb") as file_handle:
|
| 277 |
+
col_download.download_button(
|
| 278 |
+
"Download",
|
| 279 |
+
data=file_handle,
|
| 280 |
+
file_name=export_file.name,
|
| 281 |
+
key=f"dl_{rel}",
|
| 282 |
+
)
|
| 283 |
+
else:
|
| 284 |
+
st.info("Exports directory is empty.")
|
| 285 |
+
else:
|
| 286 |
+
st.info("No exports/ directory found.")
|
| 287 |
+
|
| 288 |
+
jsonl_path = run_dir / "normalized.jsonl"
|
| 289 |
+
if jsonl_path.exists():
|
| 290 |
+
with open(jsonl_path, "rb") as file_handle:
|
| 291 |
+
st.download_button(
|
| 292 |
+
"Download normalized.jsonl",
|
| 293 |
+
data=file_handle,
|
| 294 |
+
file_name="normalized.jsonl",
|
| 295 |
+
mime="application/x-ndjson",
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
st.divider()
|
| 299 |
+
|
| 300 |
+
st.markdown("### SQLite DB Preview")
|
| 301 |
+
if not db_path.exists():
|
| 302 |
+
st.info("No db.sqlite found.")
|
| 303 |
+
return
|
| 304 |
+
|
| 305 |
+
with open(db_path, "rb") as file_handle:
|
| 306 |
+
st.download_button(
|
| 307 |
+
"Download db.sqlite",
|
| 308 |
+
data=file_handle,
|
| 309 |
+
file_name="db.sqlite",
|
| 310 |
+
mime="application/x-sqlite3",
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
try:
|
| 314 |
+
conn = sqlite3.connect(db_path)
|
| 315 |
+
tables = pd.read_sql(
|
| 316 |
+
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
|
| 317 |
+
conn,
|
| 318 |
+
)["name"].tolist()
|
| 319 |
+
if tables:
|
| 320 |
+
st.write("Tables:", tables)
|
| 321 |
+
selected_table = st.selectbox("Preview table", tables, key="db_table_sel")
|
| 322 |
+
dataframe = pd.read_sql(f"SELECT * FROM [{selected_table}] LIMIT 200;", conn)
|
| 323 |
+
st.dataframe(dataframe, use_container_width=True)
|
| 324 |
+
else:
|
| 325 |
+
st.info("DB exists but contains no tables yet.")
|
| 326 |
+
conn.close()
|
| 327 |
+
except Exception as exc:
|
| 328 |
+
st.warning(f"DB preview failed: {exc}")
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
if __name__ == "__main__":
|
| 332 |
+
main()
|
gui/terminal_panel.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
ALLOWED = {
|
| 7 |
+
"help",
|
| 8 |
+
"show config",
|
| 9 |
+
"list runs",
|
| 10 |
+
"open last report",
|
| 11 |
+
"tail log",
|
| 12 |
+
"run pipeline",
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _st():
|
| 17 |
+
"""Import streamlit lazily so module import stays side-effect free."""
|
| 18 |
+
import streamlit as st
|
| 19 |
+
return st
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _runs_dir() -> Path:
|
| 23 |
+
return Path("runs")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _list_runs(limit: int = 25) -> list[Path]:
|
| 27 |
+
runs_dir = _runs_dir()
|
| 28 |
+
if not runs_dir.exists():
|
| 29 |
+
return []
|
| 30 |
+
runs = [path for path in runs_dir.iterdir() if path.is_dir()]
|
| 31 |
+
runs.sort(key=lambda path: path.name, reverse=True)
|
| 32 |
+
return runs[:limit]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _tail(path: Path, n: int = 200) -> str:
|
| 36 |
+
try:
|
| 37 |
+
lines = path.read_text(errors="replace").splitlines()
|
| 38 |
+
return "\n".join(lines[-n:])
|
| 39 |
+
except Exception as exc:
|
| 40 |
+
return f"[error] {exc}"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def render_terminal(state: dict) -> None:
|
| 44 |
+
st = _st()
|
| 45 |
+
|
| 46 |
+
st.subheader("Terminal (safe mock)")
|
| 47 |
+
st.caption("Allowed: help | show config | list runs | open last report | tail log | run pipeline")
|
| 48 |
+
|
| 49 |
+
out = st.session_state.get("_term_out", "")
|
| 50 |
+
cmd = st.text_input("Command", key="_term_cmd")
|
| 51 |
+
|
| 52 |
+
col1, col2 = st.columns([1, 1])
|
| 53 |
+
run = col1.button("Run", use_container_width=True)
|
| 54 |
+
clear = col2.button("Clear", use_container_width=True)
|
| 55 |
+
|
| 56 |
+
if clear:
|
| 57 |
+
st.session_state["_term_out"] = ""
|
| 58 |
+
st.rerun()
|
| 59 |
+
|
| 60 |
+
if run and cmd:
|
| 61 |
+
cmd_norm = cmd.strip().lower()
|
| 62 |
+
if cmd_norm not in ALLOWED:
|
| 63 |
+
out += f"\n$ {cmd}\n[blocked] command not allowed\n"
|
| 64 |
+
else:
|
| 65 |
+
out += f"\n$ {cmd}\n"
|
| 66 |
+
if cmd_norm == "help":
|
| 67 |
+
out += "help | show config | list runs | open last report | tail log | run pipeline\n"
|
| 68 |
+
elif cmd_norm == "show config":
|
| 69 |
+
out += json.dumps(state.get("effective_config", {}), indent=2) + "\n"
|
| 70 |
+
elif cmd_norm == "list runs":
|
| 71 |
+
runs = _list_runs()
|
| 72 |
+
out += "\n".join([path.name for path in runs]) + ("\n" if runs else "[none]\n")
|
| 73 |
+
elif cmd_norm == "open last report":
|
| 74 |
+
runs = _list_runs(1)
|
| 75 |
+
if not runs:
|
| 76 |
+
out += "[none]\n"
|
| 77 |
+
else:
|
| 78 |
+
report = runs[0] / "report" / "index.html"
|
| 79 |
+
out += f"{report.as_posix()}\n"
|
| 80 |
+
elif cmd_norm == "tail log":
|
| 81 |
+
runs = _list_runs(1)
|
| 82 |
+
if not runs:
|
| 83 |
+
out += "[none]\n"
|
| 84 |
+
else:
|
| 85 |
+
log_path = runs[0] / "pipeline.log"
|
| 86 |
+
out += _tail(log_path) + "\n"
|
| 87 |
+
elif cmd_norm == "run pipeline":
|
| 88 |
+
input_path = None
|
| 89 |
+
uploads = st.session_state.get("_uploads")
|
| 90 |
+
|
| 91 |
+
if uploads:
|
| 92 |
+
import tempfile
|
| 93 |
+
tmp = Path(tempfile.mkdtemp(prefix="modosint_term_"))
|
| 94 |
+
updir = tmp / "uploads"
|
| 95 |
+
updir.mkdir(parents=True, exist_ok=True)
|
| 96 |
+
for file_obj in uploads:
|
| 97 |
+
(updir / file_obj.name).write_bytes(file_obj.getbuffer())
|
| 98 |
+
input_path = updir
|
| 99 |
+
elif st.session_state.get("_local_path", "").strip():
|
| 100 |
+
path_obj = Path(st.session_state["_local_path"].strip()).expanduser()
|
| 101 |
+
if path_obj.exists():
|
| 102 |
+
input_path = path_obj
|
| 103 |
+
elif Path("samples/demo_ingest").exists():
|
| 104 |
+
input_path = Path("samples/demo_ingest")
|
| 105 |
+
|
| 106 |
+
if input_path is None:
|
| 107 |
+
out += "[error] no input available; upload files or enable demo dataset first\n"
|
| 108 |
+
else:
|
| 109 |
+
try:
|
| 110 |
+
from engine.pipeline_orchestrator import run_pipeline
|
| 111 |
+
|
| 112 |
+
ctx = run_pipeline(
|
| 113 |
+
input_path=input_path,
|
| 114 |
+
config=state.get("effective_config", {}),
|
| 115 |
+
)
|
| 116 |
+
st.session_state["last_run_id"] = ctx.run_id
|
| 117 |
+
st.session_state["last_run_dir"] = str(ctx.run_dir)
|
| 118 |
+
out += f"run_id: {ctx.run_id}\nrun_dir: {ctx.run_dir}\n"
|
| 119 |
+
for stage_name, stage_out in ctx.stage_results.items():
|
| 120 |
+
out += f" [{stage_out.status.value.upper():8s}] {stage_name}\n"
|
| 121 |
+
except Exception as exc:
|
| 122 |
+
out += f"[error] {exc}\n"
|
| 123 |
+
|
| 124 |
+
st.session_state["_term_out"] = out
|
| 125 |
+
st.rerun()
|
| 126 |
+
|
| 127 |
+
st.text_area("Output", value=st.session_state.get("_term_out", ""), height=260)
|
modules/README.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MODULES Directory
|
| 2 |
+
|
| 3 |
+
## Purpose:
|
| 4 |
+
This directory houses the functional modules responsible for specific phases of the OSINT pipeline. Each subfolder represents a stage of processing from data ingestion to export.
|
| 5 |
+
|
| 6 |
+
## Subdirectories:
|
| 7 |
+
- `ingestion/`: Connects to external data sources (web scrapers, APIs, sensors).
|
| 8 |
+
- `preprocessing/`: Cleans, deduplicates, and normalizes raw data into structured input.
|
| 9 |
+
- `ml_analysis/`: Applies machine learning models (e.g., classification, clustering, NLP).
|
| 10 |
+
- `correlation/`: Cross-references data (e.g., STIX/IOC matching, pattern detection).
|
| 11 |
+
- `export/`: Packages output into files, databases, APIs, or dashboards.
|
| 12 |
+
|
| 13 |
+
## Design Philosophy:
|
| 14 |
+
Each module is atomic, reusable, and accepts standardized JSON inputs/outputs. Naming follows functional role, and new tools should be added under appropriate phase.
|
modules/__init__.py
ADDED
|
File without changes
|
modules/correlation/correlate.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Correlation module β entity linking and cross-record correlation.
|
| 3 |
+
|
| 4 |
+
Engine contract:
|
| 5 |
+
run(EngineInput) -> EngineOutput
|
| 6 |
+
|
| 7 |
+
Groups records by shared entity keys (email, IP, phone, domain, hash)
|
| 8 |
+
and produces correlation metadata. No external dependencies (Neo4j
|
| 9 |
+
is optional and handled separately).
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from typing import Any, Dict, List, Set
|
| 17 |
+
|
| 18 |
+
from engine.io_contract import (
|
| 19 |
+
EngineInput,
|
| 20 |
+
EngineOutput,
|
| 21 |
+
NormalizedRecord,
|
| 22 |
+
StageStatus,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger("modules.correlation")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _build_entity_index(records: List[NormalizedRecord]) -> Dict[str, List[str]]:
|
| 29 |
+
"""
|
| 30 |
+
Build an inverted index: entity_value β [row_ids].
|
| 31 |
+
"""
|
| 32 |
+
index: Dict[str, List[str]] = defaultdict(list)
|
| 33 |
+
for r in records:
|
| 34 |
+
for field in ("entity_email", "entity_ip", "entity_phone",
|
| 35 |
+
"entity_domain", "entity_hash"):
|
| 36 |
+
val = getattr(r, field, None)
|
| 37 |
+
if val:
|
| 38 |
+
key = f"{field}:{val}"
|
| 39 |
+
index[key].append(r.row_id)
|
| 40 |
+
return dict(index)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _find_clusters(index: Dict[str, List[str]]) -> List[Set[str]]:
|
| 44 |
+
"""
|
| 45 |
+
Find clusters of row_ids that share at least one entity value.
|
| 46 |
+
Uses simple union-find.
|
| 47 |
+
"""
|
| 48 |
+
parent: Dict[str, str] = {}
|
| 49 |
+
|
| 50 |
+
def find(x: str) -> str:
|
| 51 |
+
while parent.get(x, x) != x:
|
| 52 |
+
parent[x] = parent.get(parent[x], parent[x])
|
| 53 |
+
x = parent[x]
|
| 54 |
+
return x
|
| 55 |
+
|
| 56 |
+
def union(a: str, b: str) -> None:
|
| 57 |
+
ra, rb = find(a), find(b)
|
| 58 |
+
if ra != rb:
|
| 59 |
+
parent[ra] = rb
|
| 60 |
+
|
| 61 |
+
for entity_key, row_ids in index.items():
|
| 62 |
+
if len(row_ids) > 1:
|
| 63 |
+
first = row_ids[0]
|
| 64 |
+
for rid in row_ids[1:]:
|
| 65 |
+
union(first, rid)
|
| 66 |
+
|
| 67 |
+
clusters: Dict[str, Set[str]] = defaultdict(set)
|
| 68 |
+
all_ids = set()
|
| 69 |
+
for row_ids in index.values():
|
| 70 |
+
all_ids.update(row_ids)
|
| 71 |
+
for rid in all_ids:
|
| 72 |
+
root = find(rid)
|
| 73 |
+
clusters[root].add(rid)
|
| 74 |
+
|
| 75 |
+
# Only return clusters with 2+ members
|
| 76 |
+
return [c for c in clusters.values() if len(c) > 1]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 80 |
+
"""
|
| 81 |
+
Correlate records by shared entity values.
|
| 82 |
+
"""
|
| 83 |
+
try:
|
| 84 |
+
records = engine_input.records
|
| 85 |
+
index = _build_entity_index(records)
|
| 86 |
+
clusters = _find_clusters(index)
|
| 87 |
+
|
| 88 |
+
# Annotate records with cluster IDs
|
| 89 |
+
row_to_cluster: Dict[str, int] = {}
|
| 90 |
+
for i, cluster in enumerate(clusters):
|
| 91 |
+
for rid in cluster:
|
| 92 |
+
row_to_cluster[rid] = i
|
| 93 |
+
|
| 94 |
+
annotated: List[NormalizedRecord] = []
|
| 95 |
+
for r in records:
|
| 96 |
+
cluster_id = row_to_cluster.get(r.row_id)
|
| 97 |
+
if cluster_id is not None:
|
| 98 |
+
extra = dict(r.extra)
|
| 99 |
+
extra["correlation_cluster"] = cluster_id
|
| 100 |
+
annotated.append(r.model_copy(update={"extra": extra}))
|
| 101 |
+
else:
|
| 102 |
+
annotated.append(r)
|
| 103 |
+
|
| 104 |
+
correlated_count = len(row_to_cluster)
|
| 105 |
+
return EngineOutput(
|
| 106 |
+
stage="correlation",
|
| 107 |
+
status=StageStatus.SUCCESS,
|
| 108 |
+
records=annotated,
|
| 109 |
+
summary=f"Found {len(clusters)} clusters linking {correlated_count} records",
|
| 110 |
+
metadata={
|
| 111 |
+
"cluster_count": len(clusters),
|
| 112 |
+
"correlated_records": correlated_count,
|
| 113 |
+
"entity_index_size": len(index),
|
| 114 |
+
},
|
| 115 |
+
)
|
| 116 |
+
except Exception as exc:
|
| 117 |
+
logger.error("Correlation failed: %s", exc, exc_info=True)
|
| 118 |
+
return EngineOutput(
|
| 119 |
+
stage="correlation",
|
| 120 |
+
status=StageStatus.FAILED,
|
| 121 |
+
error=str(exc),
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
if __name__ == "__main__":
|
| 126 |
+
print("Correlation module β use via engine pipeline")
|
modules/correlation/correlate_ioc.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
from schemas.osint_module_io import OSINTModuleInput, OSINTModuleOutput
|
| 4 |
+
|
| 5 |
+
def run(input_path: str) -> OSINTModuleOutput:
|
| 6 |
+
"""
|
| 7 |
+
Correlation phase:
|
| 8 |
+
- Loads multiple module outputs if needed
|
| 9 |
+
- Cross-references IOCs/IOAs (MISP, Shodan, honeypots)
|
| 10 |
+
- Clusters related events/patterns
|
| 11 |
+
"""
|
| 12 |
+
cfg = OSINTModuleInput.parse_file(input_path)
|
| 13 |
+
summary = "Correlation complete"
|
| 14 |
+
indicators = {}
|
| 15 |
+
confidence = 0.0
|
| 16 |
+
return OSINTModuleOutput(summary=summary, indicators=indicators, confidence=confidence)
|
| 17 |
+
|
| 18 |
+
if __name__ == "__main__":
|
| 19 |
+
parser = argparse.ArgumentParser(description="Run the Correlation module")
|
| 20 |
+
parser.add_argument("input", help="Path to OSINTModuleInput JSON")
|
| 21 |
+
args = parser.parse_args()
|
| 22 |
+
out = run(args.input)
|
| 23 |
+
print(out.json())
|
modules/export/export_results.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Export module β writes final artifacts (JSON summary, per-stage outputs).
|
| 3 |
+
|
| 4 |
+
Engine contract:
|
| 5 |
+
run(EngineInput) -> EngineOutput
|
| 6 |
+
|
| 7 |
+
The main report generation (HTML, CSV, JSONL) is handled by
|
| 8 |
+
``engine.reporting``. This module produces a supplementary JSON
|
| 9 |
+
summary artifact in the run directory.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any, Dict
|
| 18 |
+
|
| 19 |
+
from engine.io_contract import (
|
| 20 |
+
Artifact,
|
| 21 |
+
EngineInput,
|
| 22 |
+
EngineOutput,
|
| 23 |
+
StageStatus,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger("modules.export")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 30 |
+
"""
|
| 31 |
+
Export a JSON summary of the pipeline run.
|
| 32 |
+
"""
|
| 33 |
+
try:
|
| 34 |
+
run_dir = Path(engine_input.run_dir)
|
| 35 |
+
exports_dir = run_dir / "exports"
|
| 36 |
+
exports_dir.mkdir(parents=True, exist_ok=True)
|
| 37 |
+
|
| 38 |
+
# Build summary
|
| 39 |
+
summary_data = {
|
| 40 |
+
"run_id": engine_input.run_id,
|
| 41 |
+
"total_records": len(engine_input.records),
|
| 42 |
+
"input_files": [
|
| 43 |
+
{
|
| 44 |
+
"name": f.path.name,
|
| 45 |
+
"type": f.file_type.value,
|
| 46 |
+
"size": f.size_bytes,
|
| 47 |
+
}
|
| 48 |
+
for f in engine_input.input_spec.files
|
| 49 |
+
],
|
| 50 |
+
"record_sample": [
|
| 51 |
+
{
|
| 52 |
+
"row_id": r.row_id,
|
| 53 |
+
"source": r.source_file,
|
| 54 |
+
"entity_name": r.entity_name,
|
| 55 |
+
"entity_email": r.entity_email,
|
| 56 |
+
"entity_ip": r.entity_ip,
|
| 57 |
+
}
|
| 58 |
+
for r in engine_input.records[:20]
|
| 59 |
+
],
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
out_path = exports_dir / "summary.json"
|
| 63 |
+
out_path.write_text(
|
| 64 |
+
json.dumps(summary_data, indent=2, ensure_ascii=False, default=str),
|
| 65 |
+
encoding="utf-8",
|
| 66 |
+
)
|
| 67 |
+
logger.info("Exported summary to %s", out_path)
|
| 68 |
+
|
| 69 |
+
return EngineOutput(
|
| 70 |
+
stage="export",
|
| 71 |
+
status=StageStatus.SUCCESS,
|
| 72 |
+
records=engine_input.records, # pass through
|
| 73 |
+
artifacts=[
|
| 74 |
+
Artifact(
|
| 75 |
+
name="summary.json",
|
| 76 |
+
path=out_path,
|
| 77 |
+
mime_type="application/json",
|
| 78 |
+
description="Pipeline run summary",
|
| 79 |
+
)
|
| 80 |
+
],
|
| 81 |
+
summary=f"Exported summary.json ({len(engine_input.records)} records)",
|
| 82 |
+
)
|
| 83 |
+
except Exception as exc:
|
| 84 |
+
logger.error("Export failed: %s", exc, exc_info=True)
|
| 85 |
+
return EngineOutput(
|
| 86 |
+
stage="export",
|
| 87 |
+
status=StageStatus.FAILED,
|
| 88 |
+
error=str(exc),
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
# Legacy compatibility
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
|
| 96 |
+
def export(data: Any, outpath: str) -> str:
|
| 97 |
+
"""Legacy wrapper (deprecated). Use ``run()`` instead."""
|
| 98 |
+
with open(outpath, "w") as f:
|
| 99 |
+
f.write(str(data))
|
| 100 |
+
return outpath
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
|
| 104 |
+
print(export({"result": 123}, "output.txt"))
|
modules/ingestion/__init__.py
ADDED
|
File without changes
|
modules/ingestion/gathering/web_scraper.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def scrape(url):
|
| 2 |
+
# Placeholder: insert scraping logic
|
| 3 |
+
return {"url": url, "data": "scraped_content"}
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
print(scrape("https://example.com"))
|
modules/ingestion/ingest_data.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ingestion module β reads input files and produces NormalizedRecords.
|
| 3 |
+
|
| 4 |
+
Engine contract:
|
| 5 |
+
run(EngineInput) -> EngineOutput
|
| 6 |
+
|
| 7 |
+
The heavy lifting (file parsing, entity extraction) is delegated to
|
| 8 |
+
``engine.normalize``. This module acts as the pipeline-stage wrapper.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
from typing import Any, Dict
|
| 15 |
+
|
| 16 |
+
from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
|
| 17 |
+
from engine.normalize import normalize_files
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger("modules.ingestion")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 23 |
+
"""
|
| 24 |
+
Ingest files described by ``engine_input.input_spec`` and return
|
| 25 |
+
normalized records.
|
| 26 |
+
"""
|
| 27 |
+
try:
|
| 28 |
+
records = normalize_files(engine_input.input_spec)
|
| 29 |
+
return EngineOutput(
|
| 30 |
+
stage="ingestion",
|
| 31 |
+
status=StageStatus.SUCCESS,
|
| 32 |
+
records=records,
|
| 33 |
+
summary=f"Ingested {len(records)} records from {len(engine_input.input_spec.files)} files",
|
| 34 |
+
)
|
| 35 |
+
except Exception as exc:
|
| 36 |
+
logger.error("Ingestion failed: %s", exc, exc_info=True)
|
| 37 |
+
return EngineOutput(
|
| 38 |
+
stage="ingestion",
|
| 39 |
+
status=StageStatus.FAILED,
|
| 40 |
+
error=str(exc),
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# Legacy compatibility β keep old function signature working
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
|
| 48 |
+
def ingest(file_path: str) -> Dict[str, Any]:
|
| 49 |
+
"""Legacy wrapper (deprecated). Use ``run()`` instead."""
|
| 50 |
+
return {"file": file_path, "status": "ingested"}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
print(ingest("input.txt"))
|
modules/ml_analysis/ml_analysis.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ML Analysis module β entity extraction and text classification.
|
| 3 |
+
|
| 4 |
+
Engine contract:
|
| 5 |
+
run(EngineInput) -> EngineOutput
|
| 6 |
+
|
| 7 |
+
This is a lightweight placeholder that performs regex-based entity
|
| 8 |
+
extraction. When ML dependencies (torch, transformers, spacy) are
|
| 9 |
+
available, it can be extended to use real models.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
import re
|
| 16 |
+
from typing import Any, Dict, List, Optional
|
| 17 |
+
|
| 18 |
+
from engine.io_contract import (
|
| 19 |
+
EngineInput,
|
| 20 |
+
EngineOutput,
|
| 21 |
+
NormalizedRecord,
|
| 22 |
+
StageStatus,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger("modules.ml_analysis")
|
| 26 |
+
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# Lightweight entity extraction (no ML deps required)
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
|
| 32 |
+
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
|
| 33 |
+
_PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
|
| 34 |
+
_DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
|
| 35 |
+
_HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _enrich_record(record: NormalizedRecord) -> NormalizedRecord:
|
| 39 |
+
"""
|
| 40 |
+
Enrich a record by extracting entities from raw_text if not already set.
|
| 41 |
+
"""
|
| 42 |
+
text = record.raw_text
|
| 43 |
+
updates: Dict[str, Any] = {}
|
| 44 |
+
|
| 45 |
+
if not record.entity_email:
|
| 46 |
+
m = _EMAIL_RE.search(text)
|
| 47 |
+
if m:
|
| 48 |
+
updates["entity_email"] = m.group(0).lower()
|
| 49 |
+
|
| 50 |
+
if not record.entity_ip:
|
| 51 |
+
m = _IP_RE.search(text)
|
| 52 |
+
if m:
|
| 53 |
+
updates["entity_ip"] = m.group(0)
|
| 54 |
+
|
| 55 |
+
if not record.entity_phone:
|
| 56 |
+
m = _PHONE_RE.search(text)
|
| 57 |
+
if m:
|
| 58 |
+
updates["entity_phone"] = m.group(0)
|
| 59 |
+
|
| 60 |
+
if not record.entity_domain:
|
| 61 |
+
m = _DOMAIN_RE.search(text)
|
| 62 |
+
if m:
|
| 63 |
+
updates["entity_domain"] = m.group(0).lower()
|
| 64 |
+
|
| 65 |
+
if not record.entity_hash:
|
| 66 |
+
m = _HASH_RE.search(text)
|
| 67 |
+
if m:
|
| 68 |
+
updates["entity_hash"] = m.group(0).lower()
|
| 69 |
+
|
| 70 |
+
if updates:
|
| 71 |
+
return record.model_copy(update=updates)
|
| 72 |
+
return record
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
# Engine contract
|
| 77 |
+
# ---------------------------------------------------------------------------
|
| 78 |
+
|
| 79 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 80 |
+
"""
|
| 81 |
+
Run ML analysis / entity enrichment on all records.
|
| 82 |
+
"""
|
| 83 |
+
try:
|
| 84 |
+
enriched: List[NormalizedRecord] = []
|
| 85 |
+
enrichment_count = 0
|
| 86 |
+
|
| 87 |
+
for record in engine_input.records:
|
| 88 |
+
new_record = _enrich_record(record)
|
| 89 |
+
if new_record is not record:
|
| 90 |
+
enrichment_count += 1
|
| 91 |
+
enriched.append(new_record)
|
| 92 |
+
|
| 93 |
+
return EngineOutput(
|
| 94 |
+
stage="analysis",
|
| 95 |
+
status=StageStatus.SUCCESS,
|
| 96 |
+
records=enriched,
|
| 97 |
+
summary=f"Analyzed {len(enriched)} records, enriched {enrichment_count}",
|
| 98 |
+
metadata={"enriched_count": enrichment_count},
|
| 99 |
+
)
|
| 100 |
+
except Exception as exc:
|
| 101 |
+
logger.error("ML analysis failed: %s", exc, exc_info=True)
|
| 102 |
+
return EngineOutput(
|
| 103 |
+
stage="analysis",
|
| 104 |
+
status=StageStatus.FAILED,
|
| 105 |
+
error=str(exc),
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
# Legacy compatibility
|
| 111 |
+
# ---------------------------------------------------------------------------
|
| 112 |
+
|
| 113 |
+
def analyze(data: Any) -> Dict[str, Any]:
|
| 114 |
+
"""Legacy wrapper (deprecated). Use ``run()`` instead."""
|
| 115 |
+
return {"input": data, "prediction": "none"}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
print(analyze("test"))
|
modules/preprocessing/preprocess_data.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Preprocessing module β cleans and normalizes raw text in records.
|
| 3 |
+
|
| 4 |
+
Engine contract:
|
| 5 |
+
run(EngineInput) -> EngineOutput
|
| 6 |
+
|
| 7 |
+
Applies basic text cleaning to each record's ``raw_text`` field:
|
| 8 |
+
strip whitespace, normalize unicode, collapse whitespace runs.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import re
|
| 15 |
+
import unicodedata
|
| 16 |
+
from typing import Any, Dict, List
|
| 17 |
+
|
| 18 |
+
from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger("modules.preprocessing")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _clean_text(text: str) -> str:
|
| 24 |
+
"""Basic text normalization."""
|
| 25 |
+
# Unicode NFKC normalization
|
| 26 |
+
text = unicodedata.normalize("NFKC", text)
|
| 27 |
+
# Strip leading/trailing whitespace
|
| 28 |
+
text = text.strip()
|
| 29 |
+
# Collapse multiple whitespace to single space
|
| 30 |
+
text = re.sub(r"\s+", " ", text)
|
| 31 |
+
return text
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def run(engine_input: EngineInput) -> EngineOutput:
|
| 35 |
+
"""
|
| 36 |
+
Preprocess all records: clean ``raw_text``, normalize entity fields.
|
| 37 |
+
"""
|
| 38 |
+
try:
|
| 39 |
+
cleaned: List[NormalizedRecord] = []
|
| 40 |
+
for record in engine_input.records:
|
| 41 |
+
# Create a copy with cleaned text
|
| 42 |
+
updated = record.model_copy(update={
|
| 43 |
+
"raw_text": _clean_text(record.raw_text),
|
| 44 |
+
"entity_name": record.entity_name.strip() if record.entity_name else None,
|
| 45 |
+
"entity_email": record.entity_email.strip().lower() if record.entity_email else None,
|
| 46 |
+
"entity_domain": record.entity_domain.strip().lower() if record.entity_domain else None,
|
| 47 |
+
})
|
| 48 |
+
cleaned.append(updated)
|
| 49 |
+
|
| 50 |
+
return EngineOutput(
|
| 51 |
+
stage="preprocessing",
|
| 52 |
+
status=StageStatus.SUCCESS,
|
| 53 |
+
records=cleaned,
|
| 54 |
+
summary=f"Preprocessed {len(cleaned)} records",
|
| 55 |
+
)
|
| 56 |
+
except Exception as exc:
|
| 57 |
+
logger.error("Preprocessing failed: %s", exc, exc_info=True)
|
| 58 |
+
return EngineOutput(
|
| 59 |
+
stage="preprocessing",
|
| 60 |
+
status=StageStatus.FAILED,
|
| 61 |
+
error=str(exc),
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Legacy compatibility
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def preprocess(text: str) -> str:
|
| 70 |
+
"""Legacy wrapper (deprecated). Use ``run()`` instead."""
|
| 71 |
+
return text.strip().lower()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
if __name__ == "__main__":
|
| 75 |
+
print(preprocess(" This is RAW DATA. "))
|
pipeline_config.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ai:
|
| 2 |
+
mode: offline
|
| 3 |
+
provider: local_gguf
|
| 4 |
+
fallbacks:
|
| 5 |
+
- local_gguf
|
| 6 |
+
- torch_tfidf
|
| 7 |
+
gguf_path: models/gguf/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
|
| 8 |
+
n_ctx: 2048
|
| 9 |
+
n_threads: 8
|
| 10 |
+
n_gpu_layers: 0
|
| 11 |
+
temperature: 0.2
|
| 12 |
+
top_p: 0.95
|
| 13 |
+
max_tokens: 256
|
| 14 |
+
remote:
|
| 15 |
+
enabled: false
|
requirements-hf.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MOD-OSINT β Hugging Face Docker Space runtime requirements
|
| 2 |
+
# Intentionally lean: no torch, no llama-cpp, no heavy ML.
|
| 3 |
+
# ML analysis is toggled OFF by default in the GUI.
|
| 4 |
+
|
| 5 |
+
# GUI
|
| 6 |
+
streamlit>=1.35.0
|
| 7 |
+
|
| 8 |
+
# Data handling
|
| 9 |
+
pandas>=2.0.0
|
| 10 |
+
pyyaml>=6.0.2
|
| 11 |
+
|
| 12 |
+
# Engine core
|
| 13 |
+
pydantic>=2.10.0
|
| 14 |
+
jinja2>=3.1.4
|
| 15 |
+
|
| 16 |
+
# Security
|
| 17 |
+
certifi>=2024.12.14
|
samples/demo_ingest/example.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name,phone
|
| 2 |
+
John,8105551212
|
samples/demo_ingest/example.html
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<html><body>demo html</body></html>
|
samples/demo_ingest/example.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"note":"demo"}
|
samples/demo_ingest/example.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
hello beta
|
scripts/docker_entrypoint.sh
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# MOD-OSINT Docker entrypoint β startup self-check + Streamlit launch
|
| 3 |
+
set -euo pipefail
|
| 4 |
+
|
| 5 |
+
PORT="${STREAMLIT_PORT:-7860}"
|
| 6 |
+
HOST="${STREAMLIT_HOST:-0.0.0.0}"
|
| 7 |
+
|
| 8 |
+
if ! [[ "$PORT" =~ ^[0-9]+$ ]] || [ "$PORT" -lt 1 ] || [ "$PORT" -gt 65535 ]; then
|
| 9 |
+
echo "[fatal] Invalid STREAMLIT_PORT: ${PORT}" >&2
|
| 10 |
+
exit 1
|
| 11 |
+
fi
|
| 12 |
+
|
| 13 |
+
required_files=(
|
| 14 |
+
"gui/streamlit_app.py"
|
| 15 |
+
"engine/pipeline_orchestrator.py"
|
| 16 |
+
"requirements-hf.txt"
|
| 17 |
+
)
|
| 18 |
+
for file_path in "${required_files[@]}"; do
|
| 19 |
+
if [ ! -f "$file_path" ]; then
|
| 20 |
+
echo "[fatal] Missing required file: $file_path" >&2
|
| 21 |
+
exit 1
|
| 22 |
+
fi
|
| 23 |
+
done
|
| 24 |
+
|
| 25 |
+
for dir_path in runs logs; do
|
| 26 |
+
mkdir -p "$dir_path"
|
| 27 |
+
if [ ! -w "$dir_path" ]; then
|
| 28 |
+
echo "[fatal] Directory not writable: $dir_path" >&2
|
| 29 |
+
exit 1
|
| 30 |
+
fi
|
| 31 |
+
done
|
| 32 |
+
|
| 33 |
+
python3 - <<'PY'
|
| 34 |
+
import importlib
|
| 35 |
+
import sys
|
| 36 |
+
|
| 37 |
+
print(f"[startup] Python: {sys.version.split()[0]}")
|
| 38 |
+
for mod in ("streamlit", "pandas", "yaml"):
|
| 39 |
+
importlib.import_module(mod)
|
| 40 |
+
print("[startup] Dependency import check: OK")
|
| 41 |
+
PY
|
| 42 |
+
|
| 43 |
+
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
|
| 44 |
+
echo " MOD-OSINT | Streamlit GUI"
|
| 45 |
+
echo " Binding: ${HOST}:${PORT}"
|
| 46 |
+
echo " Health URL: http://localhost:${PORT}/_stcore/health"
|
| 47 |
+
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
|
| 48 |
+
|
| 49 |
+
if [ "${MODOSINT_STARTUP_CHECK_ONLY:-0}" = "1" ]; then
|
| 50 |
+
echo "[startup] MODOSINT_STARTUP_CHECK_ONLY=1 -> exiting after checks"
|
| 51 |
+
exit 0
|
| 52 |
+
fi
|
| 53 |
+
|
| 54 |
+
exec streamlit run gui/streamlit_app.py \
|
| 55 |
+
--server.address="${HOST}" \
|
| 56 |
+
--server.port="${PORT}" \
|
| 57 |
+
--server.enableCORS=false \
|
| 58 |
+
--server.enableXsrfProtection=false \
|
| 59 |
+
--server.headless=true \
|
| 60 |
+
--browser.gatherUsageStats=false
|