moddux commited on
Commit
b75c637
Β·
0 Parent(s):

deploy: HF sanitized GUI snapshot

Browse files
.dockerignore ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Version control
2
+ .git/
3
+ .gitignore
4
+
5
+ # Python caches
6
+ __pycache__/
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .eggs/
14
+
15
+ # Virtual environments
16
+ .venv/
17
+ venv/
18
+ env/
19
+
20
+ # Runtime outputs β€” never bake into image
21
+ runs/
22
+ artifacts/
23
+ logs/
24
+ *.log
25
+ *.sqlite
26
+ *.db
27
+
28
+ # Secrets / env
29
+ *.env
30
+ .env
31
+ .env.*
32
+ Vault/
33
+
34
+ # macOS
35
+ .DS_Store
36
+ .AppleDouble
37
+
38
+ # CI / audit artefacts
39
+ .branch_audit/
40
+ *.csv
41
+ *_output.json
42
+ safety_report.json
43
+
44
+ # Heavy model files
45
+ models/gguf/
46
+ ml_models/
47
+ *.gguf
48
+ *.bin
49
+ *.pt
50
+ *.pth
51
+
52
+ # Dev-only docs / notebooks
53
+ *.ipynb
54
+ *.odt
55
+ *.tex
56
+ *.pdf
57
+
58
+ # Legacy dashboard (not served by container)
59
+ dashboard/
.github/workflows/hf_sync.yml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to HF Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - beta/sanitized-minimal
7
+ workflow_dispatch:
8
+ inputs:
9
+ ref:
10
+ description: "Branch or SHA to deploy (default: beta/sanitized-minimal)"
11
+ required: false
12
+ default: "beta/sanitized-minimal"
13
+
14
+ concurrency:
15
+ group: hf-space-sync
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ hf-sync:
20
+ runs-on: ubuntu-latest
21
+ timeout-minutes: 15
22
+
23
+ steps:
24
+ - name: Checkout deployment ref
25
+ uses: actions/checkout@v4
26
+ with:
27
+ fetch-depth: 0
28
+ lfs: true
29
+ ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.ref }}
30
+
31
+ - name: Resolve checked out ref
32
+ id: ref
33
+ run: |
34
+ BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD)
35
+ if [ "$BRANCH_NAME" = "HEAD" ]; then
36
+ BRANCH_NAME=$(git name-rev --name-only --exclude=tags/* HEAD | sed 's#^remotes/origin/##')
37
+ fi
38
+ echo "branch_name=$BRANCH_NAME" >> "$GITHUB_OUTPUT"
39
+ echo "Deploying ref: $BRANCH_NAME @ $(git rev-parse --short HEAD)"
40
+
41
+ - name: Enforce sanitized deployment policy
42
+ run: |
43
+ set -euo pipefail
44
+
45
+ echo "Checking required runtime files..."
46
+ required_files=(
47
+ "README.md"
48
+ "Dockerfile"
49
+ "requirements-hf.txt"
50
+ "gui/streamlit_app.py"
51
+ "engine/pipeline_orchestrator.py"
52
+ "scripts/docker_entrypoint.sh"
53
+ )
54
+ for f in "${required_files[@]}"; do
55
+ [ -f "$f" ] || { echo "::error::Missing required file: $f"; exit 1; }
56
+ done
57
+
58
+ echo "Checking blocked paths/files..."
59
+ blocked_paths=(
60
+ "Docs"
61
+ "Docs/audit"
62
+ "Vault"
63
+ "logs"
64
+ "runs"
65
+ ".env"
66
+ "TODO.md"
67
+ "docs/progress.md"
68
+ )
69
+ violations=0
70
+ for p in "${blocked_paths[@]}"; do
71
+ if [ -e "$p" ]; then
72
+ echo "::error::Blocked path present in deployment ref: $p"
73
+ violations=1
74
+ fi
75
+ done
76
+ [ "$violations" -eq 0 ]
77
+
78
+ echo "Checking for obvious secret patterns..."
79
+ if command -v rg >/dev/null 2>&1; then
80
+ rg -n --hidden \
81
+ -g '!.git/**' \
82
+ -g '!**/*.md' \
83
+ -e 'AKIA[0-9A-Z]{16}' \
84
+ -e 'ASIA[0-9A-Z]{16}' \
85
+ -e 'ghp_[A-Za-z0-9]{36}' \
86
+ -e 'hf_[A-Za-z0-9]{30,}' \
87
+ -e 'sk_live_[0-9A-Za-z]{20,}' \
88
+ -e '-----BEGIN (RSA|EC|OPENSSH|DSA) PRIVATE KEY-----' \
89
+ -e 'xox[baprs]-[A-Za-z0-9-]{10,}' \
90
+ . && { echo "::error::Potential secret detected"; exit 1; } || true
91
+ fi
92
+
93
+ - name: Check large files (>10 MB)
94
+ run: |
95
+ set -euo pipefail
96
+ large_files=$(find . -not -path './.git/*' -type f -size +10M 2>/dev/null || true)
97
+ if [ -n "$large_files" ]; then
98
+ echo "::warning::Large files detected (>10MB):"
99
+ echo "$large_files"
100
+ echo "Consider Git LFS for required large assets."
101
+ else
102
+ echo "No large files >10MB found."
103
+ fi
104
+
105
+ - name: Push to HF Space
106
+ env:
107
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
108
+ run: |
109
+ set -euo pipefail
110
+
111
+ if [ -z "${HF_TOKEN:-}" ]; then
112
+ echo "::error::HF_TOKEN secret is not set. Configure it in GitHub Actions secrets."
113
+ exit 1
114
+ fi
115
+
116
+ git config user.email "ci@github.com"
117
+ git config user.name "CI Bot"
118
+
119
+ git remote remove hf 2>/dev/null || true
120
+ git remote add hf "https://moddux:${HF_TOKEN}@huggingface.co/spaces/moddux/mod-osint"
121
+
122
+ echo "Pushing $(git rev-parse --short HEAD) from '${{ steps.ref.outputs.branch_name }}' -> HF main"
123
+ git push hf HEAD:main --force-with-lease
124
+
125
+ echo "HF deploy push complete."
126
+ echo "Logs: https://huggingface.co/spaces/moddux/mod-osint/logs"
.gitignore ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===== Python =====
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ *.whl
11
+
12
+ # ===== Virtual environments =====
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # ===== IDE / OS =====
18
+ .git/
19
+ .DS_Store
20
+ ._*
21
+ .idea/
22
+ .vscode/
23
+ *.swp
24
+ *.swo
25
+ *~
26
+
27
+ # ===== Environment / secrets =====
28
+ *.env
29
+ .env.*
30
+
31
+ # ===== Logs =====
32
+ *.log
33
+ logs/
34
+ runner/logs/
35
+
36
+ # ===== Databases =====
37
+ *.sqlite
38
+ *.sqlite3
39
+ *.db
40
+
41
+ # ===== Run artifacts (engine output) =====
42
+ runs/
43
+ output/
44
+ *_output.json
45
+ *_output.csv
46
+
47
+ # ===== Legacy generated artifacts =====
48
+ artifacts/*.json
49
+ artifacts/*.csv
50
+ artifacts/*.html
51
+ artifacts/*.svg
52
+ artifacts/*.pdf
53
+ NETLOG*
54
+ DNSESS*
55
+ graph.cypher
56
+
57
+ # ===== Data directory (user-supplied, not tracked) =====
58
+ Data/
59
+
60
+ # ===== ML models (large binaries) =====
61
+ venv/models/gguf/*.gguf
62
+ models/
63
+ *.gguf
64
+ *.bin
65
+ *.safetensors
66
+
67
+ # ===== Nested repo (do not track) =====
68
+ MOD_OSINT/
69
+
70
+ # ===== Legacy uppercase Modules (quarantined, kept on disk) =====
71
+ # NOTE: On case-insensitive filesystems (macOS), we cannot gitignore
72
+ # "Modules/" without also blocking "modules/". The uppercase files
73
+ # were removed from the index in commit 1284d1a. We rely on the
74
+ # index removal rather than .gitignore for Modules/.
75
+
76
+ # ===== Misc generated =====
77
+ pipeline_status.json
78
+ *.zip
79
+ *.tar.gz
80
+ htmlcov/
81
+ .coverage
82
+ .pytest_cache/
83
+ .mypy_cache/
84
+ .ruff_cache/
Dockerfile ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MOD-OSINT β€” Hugging Face Docker Space
2
+ # Runs the GUI wizard wired to engine/pipeline_orchestrator.py on port 7860.
3
+ FROM python:3.11-slim
4
+
5
+ ARG BUILD_DATE="unknown"
6
+ ARG VCS_REF="unknown"
7
+
8
+ LABEL maintainer="moddux" \
9
+ org.opencontainers.image.title="MOD-OSINT" \
10
+ org.opencontainers.image.description="MOD-OSINT Streamlit GUI for HF Docker Space" \
11
+ org.opencontainers.image.source="https://github.com/moddux/MOD-OSINT" \
12
+ org.opencontainers.image.created="${BUILD_DATE}" \
13
+ org.opencontainers.image.revision="${VCS_REF}"
14
+
15
+ ENV PYTHONDONTWRITEBYTECODE=1 \
16
+ PYTHONUNBUFFERED=1 \
17
+ PIP_NO_CACHE_DIR=1 \
18
+ STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
19
+ STREAMLIT_SERVER_HEADLESS=true \
20
+ STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
21
+ STREAMLIT_SERVER_PORT=7860
22
+
23
+ RUN apt-get update \
24
+ && apt-get install -y --no-install-recommends \
25
+ ca-certificates \
26
+ git \
27
+ curl \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
+ WORKDIR /app
31
+
32
+ # Install runtime dependencies first for cache reuse.
33
+ COPY requirements-hf.txt ./requirements-hf.txt
34
+ RUN python -m pip install --upgrade pip \
35
+ && pip install --no-cache-dir -r requirements-hf.txt
36
+
37
+ # Create runtime user before copying app files.
38
+ RUN useradd -m -u 1000 appuser
39
+
40
+ # Copy application source (honors .dockerignore).
41
+ COPY --chown=appuser:appuser . .
42
+
43
+ # Runtime dirs and entrypoint permissions.
44
+ RUN mkdir -p /app/runs /app/logs \
45
+ && chown -R appuser:appuser /app/runs /app/logs \
46
+ && chmod 775 /app/runs /app/logs \
47
+ && chmod +x /app/scripts/docker_entrypoint.sh
48
+
49
+ USER appuser
50
+
51
+ EXPOSE 7860
52
+
53
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=5 \
54
+ CMD curl -fsS http://127.0.0.1:7860/_stcore/health || exit 1
55
+
56
+ ENTRYPOINT ["bash", "scripts/docker_entrypoint.sh"]
LICENSE ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy...
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: mod-osint
3
+ emoji: "🧠"
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # MOD-OSINT (Beta Sanitized Build)
12
+
13
+ This branch is the sanitized beta deployment bundle for:
14
+
15
+ - Hugging Face Space: https://huggingface.co/spaces/moddux/mod-osint
16
+
17
+ What is included:
18
+
19
+ - Runtime engine (`engine/`)
20
+ - Runtime modules (`modules/`)
21
+ - Streamlit GUI (`gui/`)
22
+ - Docker runtime files (`Dockerfile`, `requirements-hf.txt`, `scripts/docker_entrypoint.sh`)
23
+
24
+ What is excluded:
25
+
26
+ - Audit logs and runtime logs
27
+ - Development TODO/progress notes
28
+ - Internal vault/secrets files
29
+ - Non-runtime integrations and large artifacts
30
+
31
+ ## Local Run
32
+
33
+ ```bash
34
+ docker build -t mod-osint-beta .
35
+ docker run --rm -p 7860:7860 mod-osint-beta
36
+ ```
37
+
38
+ Then open: http://localhost:7860
engine/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MOD-OSINT Engine β€” centralized pipeline orchestration.
3
+
4
+ Submodules:
5
+ io_contract – Pydantic v2 typed IO schemas
6
+ registry – module registration + dependency graph
7
+ normalize – canonical schema + entity linking
8
+ storage – SQL storage (SQLite / Postgres)
9
+ reporting – HTML report + JSONL/CSV exports
10
+ pipeline_orchestrator – single entrypoint
11
+ """
12
+
13
+ __version__ = "0.1.0"
engine/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Allow ``python -m engine`` to run the pipeline orchestrator."""
2
+
3
+ from engine.pipeline_orchestrator import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
engine/io_contract.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ IO Contract β€” typed schemas for the MOD-OSINT engine.
3
+
4
+ Every pipeline module must accept ``EngineInput`` and return ``EngineOutput``.
5
+ The engine orchestrator builds ``RunContext`` which carries paths and DB handles.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import uuid
11
+ from datetime import datetime, timezone
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Enums
21
+ # ---------------------------------------------------------------------------
22
+
23
+ class FileType(str, Enum):
24
+ CSV = "csv"
25
+ JSON = "json"
26
+ TXT = "txt"
27
+ HTML = "html"
28
+ LOG = "log"
29
+ UNKNOWN = "unknown"
30
+
31
+
32
+ class StageStatus(str, Enum):
33
+ PENDING = "pending"
34
+ RUNNING = "running"
35
+ SUCCESS = "success"
36
+ FAILED = "failed"
37
+ SKIPPED = "skipped"
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Input specs
42
+ # ---------------------------------------------------------------------------
43
+
44
+ class InputFile(BaseModel):
45
+ """Descriptor for a single ingested file."""
46
+ path: Path
47
+ file_type: FileType = FileType.UNKNOWN
48
+ size_bytes: int = 0
49
+ sha256: str = ""
50
+
51
+
52
+ class InputSpec(BaseModel):
53
+ """Describes the full set of input data for a pipeline run."""
54
+ input_dir: Path
55
+ files: List[InputFile] = Field(default_factory=list)
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Normalized records
60
+ # ---------------------------------------------------------------------------
61
+
62
+ class NormalizedRecord(BaseModel):
63
+ """
64
+ Canonical record produced by the normalization stage.
65
+
66
+ Every record gets a deterministic ``row_id`` and optional entity-linking
67
+ keys so downstream modules can join/correlate across sources.
68
+ """
69
+ row_id: str = Field(default_factory=lambda: uuid.uuid4().hex[:12])
70
+ source_file: str = ""
71
+ source_type: FileType = FileType.UNKNOWN
72
+ timestamp: Optional[datetime] = None
73
+ entity_name: Optional[str] = None
74
+ entity_phone: Optional[str] = None
75
+ entity_email: Optional[str] = None
76
+ entity_ip: Optional[str] = None
77
+ entity_domain: Optional[str] = None
78
+ entity_hash: Optional[str] = None
79
+ raw_text: str = ""
80
+ extra: Dict[str, Any] = Field(default_factory=dict)
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Artifacts
85
+ # ---------------------------------------------------------------------------
86
+
87
+ class Artifact(BaseModel):
88
+ """A single output artifact produced by a module."""
89
+ name: str
90
+ path: Path
91
+ mime_type: str = "application/octet-stream"
92
+ description: str = ""
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Engine IO (module contract)
97
+ # ---------------------------------------------------------------------------
98
+
99
+ class EngineInput(BaseModel):
100
+ """
101
+ Standard input passed to every pipeline module's ``run()`` function.
102
+
103
+ Modules receive the normalized records from prior stages plus the
104
+ run context (paths, config).
105
+ """
106
+ run_id: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6])
107
+ input_spec: InputSpec
108
+ records: List[NormalizedRecord] = Field(default_factory=list)
109
+ config: Dict[str, Any] = Field(default_factory=dict)
110
+ run_dir: Path = Path("runs/default")
111
+ previous_artifacts: List[Artifact] = Field(default_factory=list)
112
+
113
+
114
+ class EngineOutput(BaseModel):
115
+ """
116
+ Standard output returned by every pipeline module's ``run()`` function.
117
+ """
118
+ stage: str
119
+ status: StageStatus = StageStatus.SUCCESS
120
+ records: List[NormalizedRecord] = Field(default_factory=list)
121
+ artifacts: List[Artifact] = Field(default_factory=list)
122
+ summary: str = ""
123
+ error: Optional[str] = None
124
+ metadata: Dict[str, Any] = Field(default_factory=dict)
125
+
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # Run context (internal, built by orchestrator)
129
+ # ---------------------------------------------------------------------------
130
+
131
+ class RunContext(BaseModel):
132
+ """
133
+ Internal context object built by the orchestrator for a single run.
134
+ Carries the run directory, DB path, and accumulated state.
135
+ """
136
+ run_id: str
137
+ run_dir: Path
138
+ db_path: Path
139
+ input_spec: InputSpec
140
+ config: Dict[str, Any] = Field(default_factory=dict)
141
+ stage_results: Dict[str, EngineOutput] = Field(default_factory=dict)
142
+
143
+ class Config:
144
+ arbitrary_types_allowed = True
engine/normalize.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Normalize β€” canonical schema builder + entity linking keys.
3
+
4
+ Reads raw files from the input directory, detects file types, parses
5
+ them into ``NormalizedRecord`` instances with deterministic row IDs
6
+ and entity-linking fields.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import csv
12
+ import hashlib
13
+ import io
14
+ import json
15
+ import logging
16
+ import re
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ from engine.io_contract import FileType, InputFile, InputSpec, NormalizedRecord
21
+
22
+ logger = logging.getLogger("engine.normalize")
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # File-type detection
26
+ # ---------------------------------------------------------------------------
27
+
28
+ _EXT_MAP: Dict[str, FileType] = {
29
+ ".csv": FileType.CSV,
30
+ ".json": FileType.JSON,
31
+ ".txt": FileType.TXT,
32
+ ".html": FileType.HTML,
33
+ ".htm": FileType.HTML,
34
+ ".log": FileType.LOG,
35
+ }
36
+
37
+
38
+ def detect_file_type(path: Path) -> FileType:
39
+ """Detect file type from extension."""
40
+ return _EXT_MAP.get(path.suffix.lower(), FileType.UNKNOWN)
41
+
42
+
43
+ def _file_sha256(path: Path) -> str:
44
+ """Compute SHA-256 hex digest of a file."""
45
+ h = hashlib.sha256()
46
+ with open(path, "rb") as f:
47
+ for chunk in iter(lambda: f.read(8192), b""):
48
+ h.update(chunk)
49
+ return h.hexdigest()
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Build InputSpec from a directory
54
+ # ---------------------------------------------------------------------------
55
+
56
+ def build_input_spec(input_dir: Path) -> InputSpec:
57
+ """
58
+ Scan *input_dir* for supported files and return an ``InputSpec``.
59
+ """
60
+ input_dir = Path(input_dir)
61
+ files: List[InputFile] = []
62
+ if input_dir.is_file():
63
+ # Single file mode
64
+ ft = detect_file_type(input_dir)
65
+ files.append(InputFile(
66
+ path=input_dir,
67
+ file_type=ft,
68
+ size_bytes=input_dir.stat().st_size,
69
+ sha256=_file_sha256(input_dir),
70
+ ))
71
+ input_dir = input_dir.parent
72
+ else:
73
+ for p in sorted(input_dir.iterdir()):
74
+ if p.is_file() and not p.name.startswith("."):
75
+ ft = detect_file_type(p)
76
+ files.append(InputFile(
77
+ path=p,
78
+ file_type=ft,
79
+ size_bytes=p.stat().st_size,
80
+ sha256=_file_sha256(p),
81
+ ))
82
+ logger.info("InputSpec: %d files from %s", len(files), input_dir)
83
+ return InputSpec(input_dir=input_dir, files=files)
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Deterministic row ID
88
+ # ---------------------------------------------------------------------------
89
+
90
+ def _make_row_id(source_file: str, index: int, content_hash: str) -> str:
91
+ """
92
+ Deterministic row ID = first 12 hex chars of SHA-256(source_file + index + content).
93
+ """
94
+ raw = f"{source_file}:{index}:{content_hash}"
95
+ return hashlib.sha256(raw.encode()).hexdigest()[:12]
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Entity extraction helpers (lightweight, no ML)
100
+ # ---------------------------------------------------------------------------
101
+
102
+ _EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
103
+ _IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
104
+ _PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
105
+ _DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
106
+ _HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
107
+
108
+
109
+ def _extract_entities(text: str) -> Dict[str, Optional[str]]:
110
+ """Extract first occurrence of common entity types from text."""
111
+ email_m = _EMAIL_RE.search(text)
112
+ ip_m = _IP_RE.search(text)
113
+ phone_m = _PHONE_RE.search(text)
114
+ domain_m = _DOMAIN_RE.search(text)
115
+ hash_m = _HASH_RE.search(text)
116
+ return {
117
+ "entity_email": email_m.group(0) if email_m else None,
118
+ "entity_ip": ip_m.group(0) if ip_m else None,
119
+ "entity_phone": phone_m.group(0) if phone_m else None,
120
+ "entity_domain": domain_m.group(0) if domain_m else None,
121
+ "entity_hash": hash_m.group(0) if hash_m else None,
122
+ }
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Parsers per file type
127
+ # ---------------------------------------------------------------------------
128
+
129
+ def _parse_csv(path: Path) -> List[NormalizedRecord]:
130
+ records: List[NormalizedRecord] = []
131
+ with open(path, newline="", encoding="utf-8", errors="replace") as f:
132
+ reader = csv.DictReader(f)
133
+ for idx, row in enumerate(reader):
134
+ text = json.dumps(row, ensure_ascii=False)
135
+ content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
136
+ entities = _extract_entities(text)
137
+ # Try to pick up common column names
138
+ name = row.get("name") or row.get("Name") or row.get("entity_name")
139
+ phone = row.get("phone") or row.get("Phone") or row.get("entity_phone")
140
+ email = row.get("email") or row.get("Email") or row.get("entity_email")
141
+ records.append(NormalizedRecord(
142
+ row_id=_make_row_id(str(path), idx, content_hash),
143
+ source_file=str(path.name),
144
+ source_type=FileType.CSV,
145
+ entity_name=name or entities.get("entity_name"),
146
+ entity_phone=phone or entities.get("entity_phone"),
147
+ entity_email=email or entities.get("entity_email"),
148
+ entity_ip=entities.get("entity_ip"),
149
+ entity_domain=entities.get("entity_domain"),
150
+ entity_hash=entities.get("entity_hash"),
151
+ raw_text=text,
152
+ extra=dict(row),
153
+ ))
154
+ return records
155
+
156
+
157
+ def _parse_json(path: Path) -> List[NormalizedRecord]:
158
+ records: List[NormalizedRecord] = []
159
+ with open(path, encoding="utf-8", errors="replace") as f:
160
+ data = json.load(f)
161
+ # Handle both single object and list of objects
162
+ items = data if isinstance(data, list) else [data]
163
+ for idx, item in enumerate(items):
164
+ text = json.dumps(item, ensure_ascii=False) if isinstance(item, dict) else str(item)
165
+ content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
166
+ entities = _extract_entities(text)
167
+ extra = item if isinstance(item, dict) else {"value": item}
168
+ records.append(NormalizedRecord(
169
+ row_id=_make_row_id(str(path), idx, content_hash),
170
+ source_file=str(path.name),
171
+ source_type=FileType.JSON,
172
+ entity_name=extra.get("name") if isinstance(extra, dict) else None,
173
+ entity_email=entities.get("entity_email"),
174
+ entity_ip=entities.get("entity_ip"),
175
+ entity_phone=entities.get("entity_phone"),
176
+ entity_domain=entities.get("entity_domain"),
177
+ entity_hash=entities.get("entity_hash"),
178
+ raw_text=text,
179
+ extra=extra,
180
+ ))
181
+ return records
182
+
183
+
184
+ def _parse_text(path: Path, file_type: FileType = FileType.TXT) -> List[NormalizedRecord]:
185
+ """Parse plain text / HTML / log files β€” one record per non-empty line."""
186
+ records: List[NormalizedRecord] = []
187
+ with open(path, encoding="utf-8", errors="replace") as f:
188
+ lines = f.readlines()
189
+ for idx, line in enumerate(lines):
190
+ line = line.strip()
191
+ if not line:
192
+ continue
193
+ content_hash = hashlib.sha256(line.encode()).hexdigest()[:16]
194
+ entities = _extract_entities(line)
195
+ records.append(NormalizedRecord(
196
+ row_id=_make_row_id(str(path), idx, content_hash),
197
+ source_file=str(path.name),
198
+ source_type=file_type,
199
+ entity_email=entities.get("entity_email"),
200
+ entity_ip=entities.get("entity_ip"),
201
+ entity_phone=entities.get("entity_phone"),
202
+ entity_domain=entities.get("entity_domain"),
203
+ entity_hash=entities.get("entity_hash"),
204
+ raw_text=line,
205
+ ))
206
+ return records
207
+
208
+
209
+ # ---------------------------------------------------------------------------
210
+ # Main normalization entry point
211
+ # ---------------------------------------------------------------------------
212
+
213
+ def normalize_files(input_spec: InputSpec) -> List[NormalizedRecord]:
214
+ """
215
+ Parse all files in *input_spec* and return a flat list of
216
+ ``NormalizedRecord`` instances.
217
+ """
218
+ all_records: List[NormalizedRecord] = []
219
+ for f in input_spec.files:
220
+ try:
221
+ if f.file_type == FileType.CSV:
222
+ recs = _parse_csv(f.path)
223
+ elif f.file_type == FileType.JSON:
224
+ recs = _parse_json(f.path)
225
+ elif f.file_type in (FileType.TXT, FileType.LOG):
226
+ recs = _parse_text(f.path, f.file_type)
227
+ elif f.file_type == FileType.HTML:
228
+ recs = _parse_text(f.path, FileType.HTML)
229
+ else:
230
+ recs = _parse_text(f.path, FileType.UNKNOWN)
231
+ logger.info("Parsed %d records from %s (%s)", len(recs), f.path.name, f.file_type.value)
232
+ all_records.extend(recs)
233
+ except Exception as exc:
234
+ logger.error("Failed to parse %s: %s", f.path, exc)
235
+ logger.info("Total normalized records: %d", len(all_records))
236
+ return all_records
engine/pipeline_orchestrator.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipeline Orchestrator β€” single entrypoint for MOD-OSINT pipeline execution.
3
+
4
+ Usage:
5
+ python -m engine.pipeline_orchestrator --input samples/demo_ingest/
6
+
7
+ Flow:
8
+ 1. Build InputSpec from --input path
9
+ 2. Normalize files β†’ NormalizedRecords
10
+ 3. Discover & register pipeline modules
11
+ 4. Run each stage in order: ingestion β†’ preprocessing β†’ analysis β†’ correlation β†’ export
12
+ 5. Store records + stage results in SQLite (or Postgres)
13
+ 6. Generate HTML report + JSONL/CSV exports
14
+ 7. Write run manifest
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import sys
23
+ import uuid
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Dict, List
27
+
28
+ from engine.io_contract import (
29
+ Artifact,
30
+ EngineInput,
31
+ EngineOutput,
32
+ InputSpec,
33
+ NormalizedRecord,
34
+ RunContext,
35
+ StageStatus,
36
+ )
37
+ from engine.normalize import build_input_spec, normalize_files
38
+ from engine.registry import discover_and_register, get_ordered_stages, get_stage
39
+ from engine.reporting import generate_report
40
+ from engine.storage import StorageBackend, create_storage
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Logging setup
44
+ # ---------------------------------------------------------------------------
45
+
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
49
+ handlers=[logging.StreamHandler(sys.stdout)],
50
+ )
51
+ logger = logging.getLogger("engine.orchestrator")
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Run directory
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def _create_run_dir(base: Path = Path("runs")) -> tuple[str, Path]:
59
+ """Create a deterministic run directory and return (run_id, run_dir)."""
60
+ run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6]
61
+ run_dir = base / run_id
62
+ run_dir.mkdir(parents=True, exist_ok=True)
63
+ return run_id, run_dir
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Core pipeline execution
68
+ # ---------------------------------------------------------------------------
69
+
70
+ def run_pipeline(
71
+ input_path: str | Path,
72
+ config: Dict | None = None,
73
+ runs_base: Path = Path("runs"),
74
+ ) -> RunContext:
75
+ """
76
+ Execute the full pipeline.
77
+
78
+ Args:
79
+ input_path: Path to input directory or single file.
80
+ config: Optional config overrides.
81
+ runs_base: Base directory for run outputs.
82
+
83
+ Returns:
84
+ RunContext with all results.
85
+ """
86
+ config = config or {}
87
+
88
+ # 1. Create run directory
89
+ run_id, run_dir = _create_run_dir(runs_base)
90
+ logger.info("═══ Pipeline run %s ═══", run_id)
91
+ logger.info("Run directory: %s", run_dir)
92
+
93
+ # 2. Build input spec
94
+ input_path = Path(input_path)
95
+ if not input_path.exists():
96
+ logger.error("Input path does not exist: %s", input_path)
97
+ sys.exit(1)
98
+ input_spec = build_input_spec(input_path)
99
+ logger.info("Input: %d files from %s", len(input_spec.files), input_spec.input_dir)
100
+
101
+ # 3. Normalize files
102
+ records = normalize_files(input_spec)
103
+ logger.info("Normalized: %d records", len(records))
104
+
105
+ # 4. Set up storage
106
+ db_path = run_dir / "db.sqlite"
107
+ storage = create_storage(db_path)
108
+
109
+ # 5. Store initial normalized records
110
+ storage.insert_records(records)
111
+
112
+ # 6. Build run context
113
+ ctx = RunContext(
114
+ run_id=run_id,
115
+ run_dir=run_dir,
116
+ db_path=db_path,
117
+ input_spec=input_spec,
118
+ config=config,
119
+ )
120
+
121
+ # 7. Discover and register modules
122
+ registered = discover_and_register()
123
+ logger.info("Registered stages: %s", registered)
124
+
125
+ # 8. Run each stage
126
+ stage_outputs: Dict[str, EngineOutput] = {}
127
+ current_records = records
128
+ all_artifacts: List[Artifact] = []
129
+
130
+ for stage_name in get_ordered_stages():
131
+ logger.info("── Stage: %s ──", stage_name)
132
+ run_fn = get_stage(stage_name)
133
+ if run_fn is None:
134
+ logger.warning("No run function for stage '%s' β€” skipping", stage_name)
135
+ stage_outputs[stage_name] = EngineOutput(
136
+ stage=stage_name,
137
+ status=StageStatus.SKIPPED,
138
+ summary="No run function found",
139
+ )
140
+ continue
141
+
142
+ # Build stage input
143
+ engine_input = EngineInput(
144
+ run_id=run_id,
145
+ input_spec=input_spec,
146
+ records=current_records,
147
+ config=config,
148
+ run_dir=run_dir,
149
+ previous_artifacts=all_artifacts,
150
+ )
151
+
152
+ try:
153
+ output = run_fn(engine_input)
154
+ if not isinstance(output, EngineOutput):
155
+ # Wrap legacy return values
156
+ output = EngineOutput(
157
+ stage=stage_name,
158
+ status=StageStatus.SUCCESS,
159
+ records=current_records,
160
+ summary=str(output) if output else "completed",
161
+ )
162
+ # Update records if the stage produced new ones
163
+ if output.records:
164
+ current_records = output.records
165
+ stage_outputs[stage_name] = output
166
+ all_artifacts.extend(output.artifacts)
167
+ logger.info(" β†’ %s: %s", stage_name, output.status.value)
168
+ if output.summary:
169
+ logger.info(" %s", output.summary)
170
+ except Exception as exc:
171
+ logger.error(" βœ— %s failed: %s", stage_name, exc, exc_info=True)
172
+ stage_outputs[stage_name] = EngineOutput(
173
+ stage=stage_name,
174
+ status=StageStatus.FAILED,
175
+ error=str(exc),
176
+ )
177
+
178
+ # Persist stage result
179
+ storage.insert_stage_result(stage_outputs[stage_name])
180
+
181
+ # 9. Store final records (may have been enriched by stages)
182
+ storage.insert_records(current_records)
183
+
184
+ # 10. Generate report
185
+ logger.info("── Generating report ──")
186
+ report_artifacts = generate_report(
187
+ run_id=run_id,
188
+ run_dir=run_dir,
189
+ input_spec=input_spec,
190
+ records=current_records,
191
+ stage_outputs=stage_outputs,
192
+ )
193
+ for a in report_artifacts:
194
+ storage.insert_artifact(a)
195
+ all_artifacts.extend(report_artifacts)
196
+
197
+ # 11. Write run manifest
198
+ manifest = {
199
+ "run_id": run_id,
200
+ "run_dir": str(run_dir),
201
+ "input_path": str(input_path),
202
+ "total_records": len(current_records),
203
+ "stages": {
204
+ name: {"status": out.status.value, "summary": out.summary}
205
+ for name, out in stage_outputs.items()
206
+ },
207
+ "artifacts": [
208
+ {"name": a.name, "path": str(a.path)}
209
+ for a in all_artifacts
210
+ ],
211
+ "completed_at": datetime.now(timezone.utc).isoformat(),
212
+ }
213
+ manifest_path = run_dir / "manifest.json"
214
+ manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
215
+ logger.info("Manifest: %s", manifest_path)
216
+
217
+ # 12. Close storage
218
+ storage.close()
219
+
220
+ # 13. Summary
221
+ logger.info("═══ Pipeline complete ═══")
222
+ logger.info(" Run ID: %s", run_id)
223
+ logger.info(" Records: %d", len(current_records))
224
+ logger.info(" DB: %s", db_path)
225
+ logger.info(" Report: %s", run_dir / "report" / "index.html")
226
+ logger.info(" Exports: %s", run_dir / "exports")
227
+
228
+ ctx.stage_results = stage_outputs
229
+ return ctx
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+ # CLI entrypoint
234
+ # ---------------------------------------------------------------------------
235
+
236
+ def main() -> None:
237
+ parser = argparse.ArgumentParser(
238
+ description="MOD-OSINT Pipeline Orchestrator",
239
+ formatter_class=argparse.RawDescriptionHelpFormatter,
240
+ epilog="""
241
+ Examples:
242
+ python -m engine.pipeline_orchestrator --input samples/demo_ingest/
243
+ python -m engine.pipeline_orchestrator --input data/case_001.csv
244
+ """,
245
+ )
246
+ parser.add_argument(
247
+ "--input", "-i",
248
+ required=True,
249
+ help="Path to input directory or single file",
250
+ )
251
+ parser.add_argument(
252
+ "--config", "-c",
253
+ default=None,
254
+ help="Path to JSON config overrides",
255
+ )
256
+ parser.add_argument(
257
+ "--runs-dir",
258
+ default="runs",
259
+ help="Base directory for run outputs (default: runs/)",
260
+ )
261
+ args = parser.parse_args()
262
+
263
+ config = {}
264
+ if args.config:
265
+ config = json.loads(Path(args.config).read_text())
266
+
267
+ run_pipeline(
268
+ input_path=args.input,
269
+ config=config,
270
+ runs_base=Path(args.runs_dir),
271
+ )
272
+
273
+
274
+ if __name__ == "__main__":
275
+ main()
engine/registry.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module Registry β€” discovery, registration, and dependency ordering.
3
+
4
+ The engine discovers pipeline modules from ``modules/`` (lowercase only).
5
+ Each module that wants to participate in the pipeline must either:
6
+
7
+ 1. Be listed in ``PIPELINE_STAGES`` (the built-in ordered list), or
8
+ 2. Register itself via the ``@pipeline_stage`` decorator.
9
+
10
+ The registry enforces that every registered module exposes a
11
+ ``run(input: EngineInput) -> EngineOutput`` callable.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import importlib
17
+ import logging
18
+ from collections import OrderedDict
19
+ from typing import Callable, Dict, List, Optional, Protocol
20
+
21
+ from engine.io_contract import EngineInput, EngineOutput
22
+
23
+ logger = logging.getLogger("engine.registry")
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Protocol that every pipeline module must satisfy
28
+ # ---------------------------------------------------------------------------
29
+
30
+ class PipelineModule(Protocol):
31
+ """Structural type for a pipeline module."""
32
+
33
+ def run(self, engine_input: EngineInput) -> EngineOutput: ...
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Built-in pipeline stage ordering
38
+ # ---------------------------------------------------------------------------
39
+
40
+ PIPELINE_STAGES: List[Dict[str, str]] = [
41
+ {"name": "ingestion", "module_path": "modules.ingestion.ingest_data"},
42
+ {"name": "preprocessing", "module_path": "modules.preprocessing.preprocess_data"},
43
+ {"name": "analysis", "module_path": "modules.ml_analysis.ml_analysis"},
44
+ {"name": "correlation", "module_path": "modules.correlation.correlate"},
45
+ {"name": "export", "module_path": "modules.export.export_results"},
46
+ ]
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Registry singleton
51
+ # ---------------------------------------------------------------------------
52
+
53
+ _registry: OrderedDict[str, Callable[[EngineInput], EngineOutput]] = OrderedDict()
54
+
55
+
56
+ def register(name: str, run_fn: Callable[[EngineInput], EngineOutput]) -> None:
57
+ """Register a module's run function under *name*."""
58
+ if name in _registry:
59
+ logger.warning("Overwriting existing registration for stage '%s'", name)
60
+ _registry[name] = run_fn
61
+ logger.info("Registered pipeline stage: %s", name)
62
+
63
+
64
+ def get_stage(name: str) -> Optional[Callable[[EngineInput], EngineOutput]]:
65
+ """Return the run function for *name*, or ``None``."""
66
+ return _registry.get(name)
67
+
68
+
69
+ def get_ordered_stages() -> List[str]:
70
+ """Return stage names in pipeline execution order."""
71
+ return list(_registry.keys())
72
+
73
+
74
+ def clear() -> None:
75
+ """Clear all registrations (useful for testing)."""
76
+ _registry.clear()
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Decorator for ad-hoc registration
81
+ # ---------------------------------------------------------------------------
82
+
83
+ def pipeline_stage(name: str):
84
+ """
85
+ Decorator to register a function as a pipeline stage::
86
+
87
+ @pipeline_stage("my_stage")
88
+ def run(engine_input: EngineInput) -> EngineOutput:
89
+ ...
90
+ """
91
+ def decorator(fn: Callable[[EngineInput], EngineOutput]):
92
+ register(name, fn)
93
+ return fn
94
+ return decorator
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Auto-discovery from PIPELINE_STAGES
99
+ # ---------------------------------------------------------------------------
100
+
101
+ def discover_and_register() -> List[str]:
102
+ """
103
+ Import each module listed in ``PIPELINE_STAGES`` and register its
104
+ ``run`` function. Returns the list of successfully registered stage names.
105
+
106
+ Only discovers from ``modules/`` (lowercase). The uppercase ``Modules/``
107
+ directory is explicitly excluded.
108
+ """
109
+ registered: List[str] = []
110
+ for stage_def in PIPELINE_STAGES:
111
+ name = stage_def["name"]
112
+ module_path = stage_def["module_path"]
113
+ try:
114
+ mod = importlib.import_module(module_path)
115
+ run_fn = getattr(mod, "run", None)
116
+ if run_fn is None:
117
+ logger.error(
118
+ "Module '%s' (%s) has no run() function β€” skipping",
119
+ name, module_path,
120
+ )
121
+ continue
122
+ register(name, run_fn)
123
+ registered.append(name)
124
+ except ImportError as exc:
125
+ logger.error(
126
+ "Failed to import module '%s' (%s): %s",
127
+ name, module_path, exc,
128
+ )
129
+ except Exception as exc:
130
+ logger.error(
131
+ "Unexpected error loading '%s' (%s): %s",
132
+ name, module_path, exc,
133
+ )
134
+ return registered
engine/reporting.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reporting β€” HTML report generation + JSONL/CSV exports.
3
+
4
+ Produces:
5
+ runs/<run_id>/report/index.html – human-browsable report
6
+ runs/<run_id>/exports/normalized.jsonl
7
+ runs/<run_id>/exports/records.csv
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import csv
13
+ import json
14
+ import logging
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List
18
+
19
+ from engine.io_contract import (
20
+ Artifact,
21
+ EngineOutput,
22
+ InputSpec,
23
+ NormalizedRecord,
24
+ )
25
+
26
+ logger = logging.getLogger("engine.reporting")
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Jinja2 setup (lazy import so the module can be imported without jinja2)
30
+ # ---------------------------------------------------------------------------
31
+
32
+ _TEMPLATE_DIR = Path(__file__).parent / "templates"
33
+
34
+
35
+ def _render_html(template_name: str, context: Dict[str, Any]) -> str:
36
+ """Render a Jinja2 template from the engine/templates/ directory."""
37
+ try:
38
+ from jinja2 import Environment, FileSystemLoader
39
+ except ImportError:
40
+ logger.warning("jinja2 not installed β€” HTML report will be a plain summary")
41
+ return _fallback_html(context)
42
+
43
+ env = Environment(
44
+ loader=FileSystemLoader(str(_TEMPLATE_DIR)),
45
+ autoescape=True,
46
+ )
47
+ template = env.get_template(template_name)
48
+ return template.render(**context)
49
+
50
+
51
+ def _fallback_html(context: Dict[str, Any]) -> str:
52
+ """Minimal HTML when Jinja2 is unavailable."""
53
+ return (
54
+ f"<html><body><h1>MOD-OSINT Report β€” {context.get('run_id', '?')}</h1>"
55
+ f"<p>Records: {context.get('total_records', 0)}</p>"
56
+ f"<p>Generated: {context.get('generated_at', '')}</p>"
57
+ f"<p><em>Install jinja2 for the full HTML report.</em></p>"
58
+ f"</body></html>"
59
+ )
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Export helpers
64
+ # ---------------------------------------------------------------------------
65
+
66
+ def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path:
67
+ """Write records as newline-delimited JSON."""
68
+ out_path.parent.mkdir(parents=True, exist_ok=True)
69
+ with open(out_path, "w", encoding="utf-8") as f:
70
+ for r in records:
71
+ f.write(r.model_dump_json() + "\n")
72
+ logger.info("Exported %d records to %s", len(records), out_path)
73
+ return out_path
74
+
75
+
76
+ def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path:
77
+ """Write records as CSV."""
78
+ out_path.parent.mkdir(parents=True, exist_ok=True)
79
+ if not records:
80
+ out_path.write_text("")
81
+ return out_path
82
+
83
+ fieldnames = list(records[0].model_dump().keys())
84
+ with open(out_path, "w", newline="", encoding="utf-8") as f:
85
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
86
+ writer.writeheader()
87
+ for r in records:
88
+ row = r.model_dump()
89
+ # Serialize complex fields
90
+ for k, v in row.items():
91
+ if isinstance(v, (dict, list)):
92
+ row[k] = json.dumps(v, ensure_ascii=False, default=str)
93
+ elif isinstance(v, Path):
94
+ row[k] = str(v)
95
+ elif v is None:
96
+ row[k] = ""
97
+ writer.writerow(row)
98
+ logger.info("Exported %d records to %s", len(records), out_path)
99
+ return out_path
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Main report generation
104
+ # ---------------------------------------------------------------------------
105
+
106
+ def generate_report(
107
+ run_id: str,
108
+ run_dir: Path,
109
+ input_spec: InputSpec,
110
+ records: List[NormalizedRecord],
111
+ stage_outputs: Dict[str, EngineOutput],
112
+ ) -> List[Artifact]:
113
+ """
114
+ Generate the full report suite:
115
+ - HTML report at ``run_dir/report/index.html``
116
+ - JSONL export at ``run_dir/exports/normalized.jsonl``
117
+ - CSV export at ``run_dir/exports/records.csv``
118
+
119
+ Returns a list of ``Artifact`` objects.
120
+ """
121
+ report_dir = run_dir / "report"
122
+ exports_dir = run_dir / "exports"
123
+ report_dir.mkdir(parents=True, exist_ok=True)
124
+ exports_dir.mkdir(parents=True, exist_ok=True)
125
+
126
+ artifacts: List[Artifact] = []
127
+
128
+ # -- JSONL export --------------------------------------------------------
129
+ jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl")
130
+ artifacts.append(Artifact(
131
+ name="normalized.jsonl",
132
+ path=jsonl_path,
133
+ mime_type="application/jsonl",
134
+ description="All normalized records in JSONL format",
135
+ ))
136
+
137
+ # -- CSV export ----------------------------------------------------------
138
+ csv_path = export_csv(records, exports_dir / "records.csv")
139
+ artifacts.append(Artifact(
140
+ name="records.csv",
141
+ path=csv_path,
142
+ mime_type="text/csv",
143
+ description="All normalized records in CSV format",
144
+ ))
145
+
146
+ # -- HTML report ---------------------------------------------------------
147
+ preview_limit = 50
148
+ stages_data = []
149
+ for name, out in stage_outputs.items():
150
+ stages_data.append({
151
+ "stage": name,
152
+ "status": out.status.value,
153
+ "summary": out.summary,
154
+ "error": out.error,
155
+ })
156
+
157
+ input_files_data = []
158
+ for f in input_spec.files:
159
+ input_files_data.append({
160
+ "name": f.path.name,
161
+ "file_type": f.file_type.value,
162
+ "size_bytes": f.size_bytes,
163
+ "sha256": f.sha256,
164
+ })
165
+
166
+ records_preview = []
167
+ for r in records[:preview_limit]:
168
+ d = r.model_dump()
169
+ # Convert Path/enum to string for template
170
+ d["source_type"] = d.get("source_type", "")
171
+ if hasattr(d["source_type"], "value"):
172
+ d["source_type"] = d["source_type"].value
173
+ records_preview.append(d)
174
+
175
+ # Build relative paths for download links
176
+ artifacts_data = []
177
+ for a in artifacts:
178
+ try:
179
+ rel = a.path.relative_to(report_dir)
180
+ except ValueError:
181
+ rel = Path("..") / "exports" / a.path.name
182
+ artifacts_data.append({"name": a.name, "rel_path": str(rel)})
183
+
184
+ context = {
185
+ "run_id": run_id,
186
+ "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
187
+ "total_records": len(records),
188
+ "input_file_count": len(input_spec.files),
189
+ "stages": stages_data,
190
+ "input_files": input_files_data,
191
+ "records_preview": records_preview,
192
+ "preview_limit": preview_limit,
193
+ "artifacts": artifacts_data,
194
+ }
195
+
196
+ html_content = _render_html("report.html", context)
197
+ html_path = report_dir / "index.html"
198
+ html_path.write_text(html_content, encoding="utf-8")
199
+ logger.info("HTML report written to %s", html_path)
200
+
201
+ artifacts.append(Artifact(
202
+ name="index.html",
203
+ path=html_path,
204
+ mime_type="text/html",
205
+ description="Human-browsable pipeline report",
206
+ ))
207
+
208
+ return artifacts
engine/storage.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Storage β€” SQL persistence for pipeline runs.
3
+
4
+ Uses SQLite by default (``runs/<run_id>/db.sqlite``).
5
+ Set ``DATABASE_URL`` env var for Postgres (e.g. ``postgresql://user:pass@host/db``).
6
+
7
+ Tables:
8
+ normalized_records – all NormalizedRecord rows
9
+ stage_results – per-stage summary + status
10
+ artifacts – artifact metadata
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ import os
18
+ import sqlite3
19
+ from pathlib import Path
20
+ from typing import List, Optional
21
+
22
+ from engine.io_contract import Artifact, EngineOutput, NormalizedRecord
23
+
24
+ logger = logging.getLogger("engine.storage")
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Schema DDL (SQLite-compatible, works with Postgres too)
28
+ # ---------------------------------------------------------------------------
29
+
30
+ _DDL = """
31
+ CREATE TABLE IF NOT EXISTS normalized_records (
32
+ row_id TEXT PRIMARY KEY,
33
+ source_file TEXT,
34
+ source_type TEXT,
35
+ timestamp TEXT,
36
+ entity_name TEXT,
37
+ entity_phone TEXT,
38
+ entity_email TEXT,
39
+ entity_ip TEXT,
40
+ entity_domain TEXT,
41
+ entity_hash TEXT,
42
+ raw_text TEXT,
43
+ extra TEXT
44
+ );
45
+
46
+ CREATE TABLE IF NOT EXISTS stage_results (
47
+ stage TEXT PRIMARY KEY,
48
+ status TEXT,
49
+ summary TEXT,
50
+ error TEXT,
51
+ metadata TEXT
52
+ );
53
+
54
+ CREATE TABLE IF NOT EXISTS artifacts (
55
+ name TEXT PRIMARY KEY,
56
+ path TEXT,
57
+ mime_type TEXT,
58
+ description TEXT
59
+ );
60
+ """
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Storage backend
65
+ # ---------------------------------------------------------------------------
66
+
67
+ class StorageBackend:
68
+ """
69
+ Thin wrapper around a SQLite (or Postgres) connection.
70
+
71
+ For this iteration we use raw ``sqlite3``. A future iteration can
72
+ swap in SQLAlchemy / SQLModel for Postgres parity.
73
+ """
74
+
75
+ def __init__(self, db_path: Path):
76
+ self.db_path = db_path
77
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
78
+ self._conn: Optional[sqlite3.Connection] = None
79
+
80
+ # -- lifecycle -----------------------------------------------------------
81
+
82
+ def connect(self) -> None:
83
+ logger.info("Connecting to SQLite: %s", self.db_path)
84
+ self._conn = sqlite3.connect(str(self.db_path))
85
+ self._conn.executescript(_DDL)
86
+ self._conn.commit()
87
+
88
+ def close(self) -> None:
89
+ if self._conn:
90
+ self._conn.close()
91
+ self._conn = None
92
+
93
+ @property
94
+ def conn(self) -> sqlite3.Connection:
95
+ if self._conn is None:
96
+ self.connect()
97
+ assert self._conn is not None
98
+ return self._conn
99
+
100
+ # -- writes --------------------------------------------------------------
101
+
102
+ def insert_records(self, records: List[NormalizedRecord]) -> int:
103
+ """Insert normalized records. Returns count inserted."""
104
+ if not records:
105
+ return 0
106
+ sql = """
107
+ INSERT OR REPLACE INTO normalized_records
108
+ (row_id, source_file, source_type, timestamp,
109
+ entity_name, entity_phone, entity_email,
110
+ entity_ip, entity_domain, entity_hash,
111
+ raw_text, extra)
112
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
113
+ """
114
+ rows = [
115
+ (
116
+ r.row_id,
117
+ r.source_file,
118
+ r.source_type.value if r.source_type else "",
119
+ r.timestamp.isoformat() if r.timestamp else None,
120
+ r.entity_name,
121
+ r.entity_phone,
122
+ r.entity_email,
123
+ r.entity_ip,
124
+ r.entity_domain,
125
+ r.entity_hash,
126
+ r.raw_text,
127
+ json.dumps(r.extra, ensure_ascii=False, default=str),
128
+ )
129
+ for r in records
130
+ ]
131
+ self.conn.executemany(sql, rows)
132
+ self.conn.commit()
133
+ logger.info("Inserted %d normalized records", len(rows))
134
+ return len(rows)
135
+
136
+ def insert_stage_result(self, output: EngineOutput) -> None:
137
+ """Upsert a stage result row."""
138
+ sql = """
139
+ INSERT OR REPLACE INTO stage_results
140
+ (stage, status, summary, error, metadata)
141
+ VALUES (?, ?, ?, ?, ?)
142
+ """
143
+ self.conn.execute(sql, (
144
+ output.stage,
145
+ output.status.value,
146
+ output.summary,
147
+ output.error,
148
+ json.dumps(output.metadata, ensure_ascii=False, default=str),
149
+ ))
150
+ self.conn.commit()
151
+
152
+ def insert_artifact(self, artifact: Artifact) -> None:
153
+ """Upsert an artifact metadata row."""
154
+ sql = """
155
+ INSERT OR REPLACE INTO artifacts
156
+ (name, path, mime_type, description)
157
+ VALUES (?, ?, ?, ?)
158
+ """
159
+ self.conn.execute(sql, (
160
+ artifact.name,
161
+ str(artifact.path),
162
+ artifact.mime_type,
163
+ artifact.description,
164
+ ))
165
+ self.conn.commit()
166
+
167
+ # -- reads ---------------------------------------------------------------
168
+
169
+ def count_records(self) -> int:
170
+ cur = self.conn.execute("SELECT COUNT(*) FROM normalized_records")
171
+ return cur.fetchone()[0]
172
+
173
+ def fetch_all_records(self) -> List[dict]:
174
+ """Return all normalized records as dicts."""
175
+ cur = self.conn.execute("SELECT * FROM normalized_records")
176
+ cols = [d[0] for d in cur.description]
177
+ return [dict(zip(cols, row)) for row in cur.fetchall()]
178
+
179
+ def fetch_stage_results(self) -> List[dict]:
180
+ cur = self.conn.execute("SELECT * FROM stage_results")
181
+ cols = [d[0] for d in cur.description]
182
+ return [dict(zip(cols, row)) for row in cur.fetchall()]
183
+
184
+
185
+ def create_storage(db_path: Path) -> StorageBackend:
186
+ """Factory: create and connect a StorageBackend."""
187
+ backend = StorageBackend(db_path)
188
+ backend.connect()
189
+ return backend
engine/templates/report.html ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>MOD-OSINT Report β€” {{ run_id }}</title>
7
+ <style>
8
+ :root {
9
+ --bg: #0d1117;
10
+ --surface: #161b22;
11
+ --border: #30363d;
12
+ --text: #c9d1d9;
13
+ --accent: #58a6ff;
14
+ --success: #3fb950;
15
+ --error: #f85149;
16
+ --warn: #d29922;
17
+ }
18
+ * { box-sizing: border-box; margin: 0; padding: 0; }
19
+ body {
20
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
21
+ background: var(--bg);
22
+ color: var(--text);
23
+ padding: 2rem;
24
+ line-height: 1.6;
25
+ }
26
+ h1, h2, h3 { color: var(--accent); margin-bottom: 0.5rem; }
27
+ h1 { font-size: 1.8rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
28
+ h2 { font-size: 1.3rem; margin-top: 2rem; }
29
+ .meta { color: #8b949e; font-size: 0.9rem; margin-bottom: 1.5rem; }
30
+ .card {
31
+ background: var(--surface);
32
+ border: 1px solid var(--border);
33
+ border-radius: 6px;
34
+ padding: 1rem 1.2rem;
35
+ margin-bottom: 1rem;
36
+ }
37
+ .badge {
38
+ display: inline-block;
39
+ padding: 2px 8px;
40
+ border-radius: 12px;
41
+ font-size: 0.8rem;
42
+ font-weight: 600;
43
+ }
44
+ .badge-success { background: var(--success); color: #000; }
45
+ .badge-failed { background: var(--error); color: #fff; }
46
+ .badge-skipped { background: var(--warn); color: #000; }
47
+ table {
48
+ width: 100%;
49
+ border-collapse: collapse;
50
+ margin-top: 0.5rem;
51
+ font-size: 0.85rem;
52
+ }
53
+ th, td {
54
+ text-align: left;
55
+ padding: 6px 10px;
56
+ border-bottom: 1px solid var(--border);
57
+ }
58
+ th { color: var(--accent); font-weight: 600; }
59
+ tr:hover { background: rgba(88,166,255,0.05); }
60
+ .truncate { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
61
+ a { color: var(--accent); text-decoration: none; }
62
+ a:hover { text-decoration: underline; }
63
+ .downloads { display: flex; gap: 1rem; flex-wrap: wrap; margin-top: 1rem; }
64
+ .dl-btn {
65
+ display: inline-block;
66
+ padding: 8px 16px;
67
+ background: var(--accent);
68
+ color: #000;
69
+ border-radius: 6px;
70
+ font-weight: 600;
71
+ font-size: 0.85rem;
72
+ }
73
+ .dl-btn:hover { opacity: 0.85; text-decoration: none; }
74
+ </style>
75
+ </head>
76
+ <body>
77
+ <h1>πŸ•΅οΈ MOD-OSINT Pipeline Report</h1>
78
+ <p class="meta">
79
+ Run ID: <strong>{{ run_id }}</strong> &nbsp;|&nbsp;
80
+ Generated: <strong>{{ generated_at }}</strong> &nbsp;|&nbsp;
81
+ Records: <strong>{{ total_records }}</strong> &nbsp;|&nbsp;
82
+ Input files: <strong>{{ input_file_count }}</strong>
83
+ </p>
84
+
85
+ <!-- Stage Results -->
86
+ <h2>Pipeline Stages</h2>
87
+ {% for stage in stages %}
88
+ <div class="card">
89
+ <strong>{{ stage.stage }}</strong>
90
+ {% if stage.status == "success" %}
91
+ <span class="badge badge-success">βœ“ success</span>
92
+ {% elif stage.status == "failed" %}
93
+ <span class="badge badge-failed">βœ— failed</span>
94
+ {% else %}
95
+ <span class="badge badge-skipped">⊘ {{ stage.status }}</span>
96
+ {% endif %}
97
+ {% if stage.summary %}
98
+ <p style="margin-top:0.4rem; font-size:0.9rem;">{{ stage.summary }}</p>
99
+ {% endif %}
100
+ {% if stage.error %}
101
+ <p style="margin-top:0.4rem; color:var(--error); font-size:0.85rem;">Error: {{ stage.error }}</p>
102
+ {% endif %}
103
+ </div>
104
+ {% endfor %}
105
+
106
+ <!-- Input Files -->
107
+ <h2>Input Files</h2>
108
+ <div class="card">
109
+ <table>
110
+ <thead><tr><th>File</th><th>Type</th><th>Size</th><th>SHA-256</th></tr></thead>
111
+ <tbody>
112
+ {% for f in input_files %}
113
+ <tr>
114
+ <td>{{ f.name }}</td>
115
+ <td>{{ f.file_type }}</td>
116
+ <td>{{ f.size_bytes }} B</td>
117
+ <td class="truncate" title="{{ f.sha256 }}">{{ f.sha256[:16] }}…</td>
118
+ </tr>
119
+ {% endfor %}
120
+ </tbody>
121
+ </table>
122
+ </div>
123
+
124
+ <!-- Records Preview -->
125
+ <h2>Normalized Records (first {{ preview_limit }})</h2>
126
+ <div class="card" style="overflow-x:auto;">
127
+ <table>
128
+ <thead>
129
+ <tr>
130
+ <th>row_id</th>
131
+ <th>source</th>
132
+ <th>type</th>
133
+ <th>name</th>
134
+ <th>phone</th>
135
+ <th>email</th>
136
+ <th>ip</th>
137
+ <th>domain</th>
138
+ <th>raw_text</th>
139
+ </tr>
140
+ </thead>
141
+ <tbody>
142
+ {% for r in records_preview %}
143
+ <tr>
144
+ <td><code>{{ r.row_id }}</code></td>
145
+ <td>{{ r.source_file }}</td>
146
+ <td>{{ r.source_type }}</td>
147
+ <td>{{ r.entity_name or '' }}</td>
148
+ <td>{{ r.entity_phone or '' }}</td>
149
+ <td>{{ r.entity_email or '' }}</td>
150
+ <td>{{ r.entity_ip or '' }}</td>
151
+ <td>{{ r.entity_domain or '' }}</td>
152
+ <td class="truncate" title="{{ r.raw_text }}">{{ r.raw_text[:80] }}</td>
153
+ </tr>
154
+ {% endfor %}
155
+ </tbody>
156
+ </table>
157
+ </div>
158
+
159
+ <!-- Downloads -->
160
+ <h2>Downloadable Artifacts</h2>
161
+ <div class="downloads">
162
+ {% for a in artifacts %}
163
+ <a class="dl-btn" href="{{ a.rel_path }}" download>⬇ {{ a.name }}</a>
164
+ {% endfor %}
165
+ </div>
166
+
167
+ <p class="meta" style="margin-top:3rem;">
168
+ Generated by <strong>MOD-OSINT Engine v0.1.0</strong>
169
+ </p>
170
+ </body>
171
+ </html>
gui/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ MOD-OSINT GUI package.
3
+
4
+ Streamlit entrypoint: gui/streamlit_app.py
5
+ """
gui/streamlit_app.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MOD-OSINT Streamlit GUI Wizard
3
+ Wired to engine.pipeline_orchestrator.run_pipeline()
4
+
5
+ Stages:
6
+ A β€” Upload / Input selection
7
+ B β€” Settings
8
+ C β€” Run pipeline
9
+ D β€” Browse / Export results
10
+
11
+ Import safety:
12
+ This module avoids importing Streamlit at module load time so CI/tests can
13
+ import it without ScriptRunContext warnings.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import sqlite3
18
+ import tempfile
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+
23
+ _DEMO_DIR = Path("samples/demo_ingest")
24
+
25
+
26
+ def _load_yaml_defaults(path: Path) -> dict:
27
+ try:
28
+ import yaml # optional; provided by requirements-hf.txt
29
+ return yaml.safe_load(path.read_text()) or {}
30
+ except Exception:
31
+ return {}
32
+
33
+
34
+ def _write_uploads(uploads) -> Path:
35
+ """Save uploaded files into a temp dir and return the dir path."""
36
+ tmp = Path(tempfile.mkdtemp(prefix="modosint_"))
37
+ updir = tmp / "uploads"
38
+ updir.mkdir(parents=True, exist_ok=True)
39
+ for file_obj in uploads:
40
+ (updir / file_obj.name).write_bytes(file_obj.getbuffer())
41
+ return updir
42
+
43
+
44
+ def _resolve_input(session_state) -> Path | None:
45
+ """Determine input from session state (uploads > local path > demo)."""
46
+ uploads = session_state.get("_uploads")
47
+ if uploads:
48
+ return _write_uploads(uploads)
49
+
50
+ local_path = session_state.get("_local_path", "").strip()
51
+ if local_path:
52
+ path_obj = Path(local_path).expanduser()
53
+ if path_obj.exists():
54
+ return path_obj
55
+
56
+ if session_state.get("_use_demo") and _DEMO_DIR.exists():
57
+ return _DEMO_DIR
58
+
59
+ return None
60
+
61
+
62
+ def main() -> None:
63
+ """Entrypoint for `streamlit run gui/streamlit_app.py`."""
64
+ import streamlit as st
65
+ import streamlit.components.v1 as components
66
+
67
+ from engine.pipeline_orchestrator import run_pipeline
68
+ from gui.terminal_panel import render_terminal
69
+
70
+ st.set_page_config(
71
+ page_title="MOD-OSINT",
72
+ page_icon="🧠",
73
+ layout="wide",
74
+ initial_sidebar_state="expanded",
75
+ )
76
+
77
+ st.title("🧠 MOD-OSINT")
78
+ st.caption("GUI wizard -> `engine.pipeline_orchestrator.run_pipeline()`")
79
+
80
+ if "effective_config" not in st.session_state:
81
+ st.session_state["effective_config"] = {}
82
+ if "last_run_id" not in st.session_state:
83
+ st.session_state["last_run_id"] = None
84
+ if "last_run_dir" not in st.session_state:
85
+ st.session_state["last_run_dir"] = None
86
+
87
+ with st.sidebar:
88
+ render_terminal({"effective_config": st.session_state["effective_config"]})
89
+
90
+ tab_upload, tab_settings, tab_run, tab_browse = st.tabs(
91
+ ["πŸ“‚ Upload", "βš™οΈ Settings", "▢️ Run", "πŸ“Š Browse"]
92
+ )
93
+
94
+ with tab_upload:
95
+ st.subheader("A) Upload or select input")
96
+
97
+ uploads = st.file_uploader(
98
+ "Upload files (CSV, JSON, TXT, HTML, LOG)",
99
+ accept_multiple_files=True,
100
+ key="_uploads",
101
+ )
102
+ if uploads:
103
+ st.success(f"Queued {len(uploads)} file(s): {[u.name for u in uploads]}")
104
+
105
+ st.divider()
106
+
107
+ local_path = st.text_input(
108
+ "Or enter a local directory / file path",
109
+ value="",
110
+ key="_local_path",
111
+ placeholder="/path/to/data/",
112
+ )
113
+
114
+ st.divider()
115
+
116
+ st.checkbox(
117
+ f"Use built-in demo dataset (`{_DEMO_DIR}`)",
118
+ value=not bool(uploads) and not bool(local_path),
119
+ key="_use_demo",
120
+ disabled=not _DEMO_DIR.exists(),
121
+ help="Runs the pipeline against samples/demo_ingest/ for quick smoke testing.",
122
+ )
123
+
124
+ if _DEMO_DIR.exists():
125
+ demo_files = sorted(_DEMO_DIR.iterdir())
126
+ st.caption(f"Demo files: {[f.name for f in demo_files if f.is_file()]}")
127
+ else:
128
+ st.caption("`samples/demo_ingest/` not found in working directory.")
129
+
130
+ with tab_settings:
131
+ st.subheader("B) Pipeline settings")
132
+
133
+ cfg_path = Path("pipeline_config.yaml")
134
+ defaults = _load_yaml_defaults(cfg_path) if cfg_path.exists() else {}
135
+
136
+ col_left, col_right = st.columns(2)
137
+ with col_left:
138
+ offline_mode = st.toggle(
139
+ "offline_mode",
140
+ value=True,
141
+ help="Disable all outbound network calls.",
142
+ )
143
+ enable_ml = st.toggle(
144
+ "enable_ml_analysis",
145
+ value=False,
146
+ help="Enable ML/NLP stage (requires torch; off by default).",
147
+ )
148
+ with col_right:
149
+ correlation_mode = st.selectbox(
150
+ "correlation_mode",
151
+ ["basic", "in-memory"],
152
+ index=0,
153
+ help="basic = simple entity matching; in-memory = graph in RAM.",
154
+ )
155
+
156
+ effective_config: dict = defaults.copy()
157
+ effective_config.setdefault("runtime", {})
158
+ effective_config["runtime"].update(
159
+ {
160
+ "offline_mode": offline_mode,
161
+ "enable_ml_analysis": enable_ml,
162
+ "correlation_mode": correlation_mode,
163
+ }
164
+ )
165
+ st.session_state["effective_config"] = effective_config
166
+
167
+ st.markdown("**Effective config (passed to engine):**")
168
+ st.json(effective_config)
169
+
170
+ with tab_run:
171
+ st.subheader("C) Run pipeline")
172
+ st.caption("Outputs are written to `runs/<run_id>/`.")
173
+
174
+ input_path = _resolve_input(st.session_state)
175
+ if input_path:
176
+ st.info(f"Input resolved -> `{input_path}`")
177
+ else:
178
+ st.warning("No input selected. Go to Upload tab or enable demo dataset.")
179
+
180
+ run_btn = st.button("πŸš€ Run pipeline now", type="primary", disabled=input_path is None)
181
+
182
+ if run_btn and input_path:
183
+ progress = st.progress(0, text="Starting...")
184
+ log_area = st.empty()
185
+ log_lines: list[str] = []
186
+
187
+ def _log(message: str) -> None:
188
+ log_lines.append(message)
189
+ log_area.code("\n".join(log_lines[-40:]), language="bash")
190
+
191
+ _log(f"Input: {input_path}")
192
+ _log("Calling engine.pipeline_orchestrator.run_pipeline()...")
193
+ progress.progress(10, text="Normalizing files...")
194
+
195
+ try:
196
+ ctx = run_pipeline(
197
+ input_path=input_path,
198
+ config=st.session_state["effective_config"],
199
+ )
200
+ st.session_state["last_run_id"] = ctx.run_id
201
+ st.session_state["last_run_dir"] = str(ctx.run_dir)
202
+
203
+ progress.progress(90, text="Generating report...")
204
+ _log(f"Run ID: {ctx.run_id}")
205
+ _log(f"Run dir: {ctx.run_dir}")
206
+
207
+ if ctx.stage_results:
208
+ for stage_name, stage_out in ctx.stage_results.items():
209
+ _log(f" [{stage_out.status.value.upper():8s}] {stage_name}")
210
+
211
+ progress.progress(100, text="Done")
212
+ st.success(f"Pipeline complete - run `{ctx.run_id}`")
213
+ st.code(str(ctx.run_dir))
214
+ st.info("Switch to Browse tab to explore outputs.")
215
+
216
+ except Exception as exc:
217
+ progress.empty()
218
+ st.error(f"Pipeline failed: {exc}")
219
+ _log(f"ERROR: {exc}")
220
+
221
+ with tab_browse:
222
+ st.subheader("D) Browse results")
223
+
224
+ run_dir_str = st.session_state.get("last_run_dir")
225
+ if not run_dir_str:
226
+ st.info("Run the pipeline first (Stage C).")
227
+ return
228
+
229
+ run_dir = Path(run_dir_str)
230
+ report_html = run_dir / "report" / "index.html"
231
+ db_path = run_dir / "db.sqlite"
232
+ exports_dir = run_dir / "exports"
233
+ manifest_path = run_dir / "manifest.json"
234
+
235
+ col1, col2, col3, col4 = st.columns(4)
236
+ col1.metric("Run ID", st.session_state.get("last_run_id", "-"))
237
+ col2.metric("Report", "yes" if report_html.exists() else "no")
238
+ col3.metric("DB", "yes" if db_path.exists() else "no")
239
+ col4.metric("Exports", str(len(list(exports_dir.rglob("*"))) if exports_dir.exists() else 0))
240
+
241
+ if manifest_path.exists():
242
+ with st.expander("Run manifest"):
243
+ import json
244
+ st.json(json.loads(manifest_path.read_text()))
245
+
246
+ st.divider()
247
+
248
+ st.markdown("### HTML Report")
249
+ if report_html.exists():
250
+ st.markdown(f"`{report_html}`")
251
+ try:
252
+ components.html(report_html.read_text(errors="replace"), height=700, scrolling=True)
253
+ except Exception as exc:
254
+ st.warning(f"Inline render failed ({exc}). Open the path above in a browser.")
255
+
256
+ with open(report_html, "rb") as file_handle:
257
+ st.download_button(
258
+ "Download report/index.html",
259
+ data=file_handle,
260
+ file_name="index.html",
261
+ mime="text/html",
262
+ )
263
+ else:
264
+ st.info("No report/index.html yet.")
265
+
266
+ st.divider()
267
+
268
+ st.markdown("### Exports")
269
+ if exports_dir.exists():
270
+ export_files = sorted([path for path in exports_dir.rglob("*") if path.is_file()])
271
+ if export_files:
272
+ for export_file in export_files:
273
+ rel = export_file.relative_to(run_dir).as_posix()
274
+ col_path, col_download = st.columns([3, 1])
275
+ col_path.write(f"`{rel}`")
276
+ with open(export_file, "rb") as file_handle:
277
+ col_download.download_button(
278
+ "Download",
279
+ data=file_handle,
280
+ file_name=export_file.name,
281
+ key=f"dl_{rel}",
282
+ )
283
+ else:
284
+ st.info("Exports directory is empty.")
285
+ else:
286
+ st.info("No exports/ directory found.")
287
+
288
+ jsonl_path = run_dir / "normalized.jsonl"
289
+ if jsonl_path.exists():
290
+ with open(jsonl_path, "rb") as file_handle:
291
+ st.download_button(
292
+ "Download normalized.jsonl",
293
+ data=file_handle,
294
+ file_name="normalized.jsonl",
295
+ mime="application/x-ndjson",
296
+ )
297
+
298
+ st.divider()
299
+
300
+ st.markdown("### SQLite DB Preview")
301
+ if not db_path.exists():
302
+ st.info("No db.sqlite found.")
303
+ return
304
+
305
+ with open(db_path, "rb") as file_handle:
306
+ st.download_button(
307
+ "Download db.sqlite",
308
+ data=file_handle,
309
+ file_name="db.sqlite",
310
+ mime="application/x-sqlite3",
311
+ )
312
+
313
+ try:
314
+ conn = sqlite3.connect(db_path)
315
+ tables = pd.read_sql(
316
+ "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
317
+ conn,
318
+ )["name"].tolist()
319
+ if tables:
320
+ st.write("Tables:", tables)
321
+ selected_table = st.selectbox("Preview table", tables, key="db_table_sel")
322
+ dataframe = pd.read_sql(f"SELECT * FROM [{selected_table}] LIMIT 200;", conn)
323
+ st.dataframe(dataframe, use_container_width=True)
324
+ else:
325
+ st.info("DB exists but contains no tables yet.")
326
+ conn.close()
327
+ except Exception as exc:
328
+ st.warning(f"DB preview failed: {exc}")
329
+
330
+
331
+ if __name__ == "__main__":
332
+ main()
gui/terminal_panel.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ ALLOWED = {
7
+ "help",
8
+ "show config",
9
+ "list runs",
10
+ "open last report",
11
+ "tail log",
12
+ "run pipeline",
13
+ }
14
+
15
+
16
+ def _st():
17
+ """Import streamlit lazily so module import stays side-effect free."""
18
+ import streamlit as st
19
+ return st
20
+
21
+
22
+ def _runs_dir() -> Path:
23
+ return Path("runs")
24
+
25
+
26
+ def _list_runs(limit: int = 25) -> list[Path]:
27
+ runs_dir = _runs_dir()
28
+ if not runs_dir.exists():
29
+ return []
30
+ runs = [path for path in runs_dir.iterdir() if path.is_dir()]
31
+ runs.sort(key=lambda path: path.name, reverse=True)
32
+ return runs[:limit]
33
+
34
+
35
+ def _tail(path: Path, n: int = 200) -> str:
36
+ try:
37
+ lines = path.read_text(errors="replace").splitlines()
38
+ return "\n".join(lines[-n:])
39
+ except Exception as exc:
40
+ return f"[error] {exc}"
41
+
42
+
43
+ def render_terminal(state: dict) -> None:
44
+ st = _st()
45
+
46
+ st.subheader("Terminal (safe mock)")
47
+ st.caption("Allowed: help | show config | list runs | open last report | tail log | run pipeline")
48
+
49
+ out = st.session_state.get("_term_out", "")
50
+ cmd = st.text_input("Command", key="_term_cmd")
51
+
52
+ col1, col2 = st.columns([1, 1])
53
+ run = col1.button("Run", use_container_width=True)
54
+ clear = col2.button("Clear", use_container_width=True)
55
+
56
+ if clear:
57
+ st.session_state["_term_out"] = ""
58
+ st.rerun()
59
+
60
+ if run and cmd:
61
+ cmd_norm = cmd.strip().lower()
62
+ if cmd_norm not in ALLOWED:
63
+ out += f"\n$ {cmd}\n[blocked] command not allowed\n"
64
+ else:
65
+ out += f"\n$ {cmd}\n"
66
+ if cmd_norm == "help":
67
+ out += "help | show config | list runs | open last report | tail log | run pipeline\n"
68
+ elif cmd_norm == "show config":
69
+ out += json.dumps(state.get("effective_config", {}), indent=2) + "\n"
70
+ elif cmd_norm == "list runs":
71
+ runs = _list_runs()
72
+ out += "\n".join([path.name for path in runs]) + ("\n" if runs else "[none]\n")
73
+ elif cmd_norm == "open last report":
74
+ runs = _list_runs(1)
75
+ if not runs:
76
+ out += "[none]\n"
77
+ else:
78
+ report = runs[0] / "report" / "index.html"
79
+ out += f"{report.as_posix()}\n"
80
+ elif cmd_norm == "tail log":
81
+ runs = _list_runs(1)
82
+ if not runs:
83
+ out += "[none]\n"
84
+ else:
85
+ log_path = runs[0] / "pipeline.log"
86
+ out += _tail(log_path) + "\n"
87
+ elif cmd_norm == "run pipeline":
88
+ input_path = None
89
+ uploads = st.session_state.get("_uploads")
90
+
91
+ if uploads:
92
+ import tempfile
93
+ tmp = Path(tempfile.mkdtemp(prefix="modosint_term_"))
94
+ updir = tmp / "uploads"
95
+ updir.mkdir(parents=True, exist_ok=True)
96
+ for file_obj in uploads:
97
+ (updir / file_obj.name).write_bytes(file_obj.getbuffer())
98
+ input_path = updir
99
+ elif st.session_state.get("_local_path", "").strip():
100
+ path_obj = Path(st.session_state["_local_path"].strip()).expanduser()
101
+ if path_obj.exists():
102
+ input_path = path_obj
103
+ elif Path("samples/demo_ingest").exists():
104
+ input_path = Path("samples/demo_ingest")
105
+
106
+ if input_path is None:
107
+ out += "[error] no input available; upload files or enable demo dataset first\n"
108
+ else:
109
+ try:
110
+ from engine.pipeline_orchestrator import run_pipeline
111
+
112
+ ctx = run_pipeline(
113
+ input_path=input_path,
114
+ config=state.get("effective_config", {}),
115
+ )
116
+ st.session_state["last_run_id"] = ctx.run_id
117
+ st.session_state["last_run_dir"] = str(ctx.run_dir)
118
+ out += f"run_id: {ctx.run_id}\nrun_dir: {ctx.run_dir}\n"
119
+ for stage_name, stage_out in ctx.stage_results.items():
120
+ out += f" [{stage_out.status.value.upper():8s}] {stage_name}\n"
121
+ except Exception as exc:
122
+ out += f"[error] {exc}\n"
123
+
124
+ st.session_state["_term_out"] = out
125
+ st.rerun()
126
+
127
+ st.text_area("Output", value=st.session_state.get("_term_out", ""), height=260)
modules/README.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MODULES Directory
2
+
3
+ ## Purpose:
4
+ This directory houses the functional modules responsible for specific phases of the OSINT pipeline. Each subfolder represents a stage of processing from data ingestion to export.
5
+
6
+ ## Subdirectories:
7
+ - `ingestion/`: Connects to external data sources (web scrapers, APIs, sensors).
8
+ - `preprocessing/`: Cleans, deduplicates, and normalizes raw data into structured input.
9
+ - `ml_analysis/`: Applies machine learning models (e.g., classification, clustering, NLP).
10
+ - `correlation/`: Cross-references data (e.g., STIX/IOC matching, pattern detection).
11
+ - `export/`: Packages output into files, databases, APIs, or dashboards.
12
+
13
+ ## Design Philosophy:
14
+ Each module is atomic, reusable, and accepts standardized JSON inputs/outputs. Naming follows functional role, and new tools should be added under appropriate phase.
modules/__init__.py ADDED
File without changes
modules/correlation/correlate.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Correlation module β€” entity linking and cross-record correlation.
3
+
4
+ Engine contract:
5
+ run(EngineInput) -> EngineOutput
6
+
7
+ Groups records by shared entity keys (email, IP, phone, domain, hash)
8
+ and produces correlation metadata. No external dependencies (Neo4j
9
+ is optional and handled separately).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from collections import defaultdict
16
+ from typing import Any, Dict, List, Set
17
+
18
+ from engine.io_contract import (
19
+ EngineInput,
20
+ EngineOutput,
21
+ NormalizedRecord,
22
+ StageStatus,
23
+ )
24
+
25
+ logger = logging.getLogger("modules.correlation")
26
+
27
+
28
+ def _build_entity_index(records: List[NormalizedRecord]) -> Dict[str, List[str]]:
29
+ """
30
+ Build an inverted index: entity_value β†’ [row_ids].
31
+ """
32
+ index: Dict[str, List[str]] = defaultdict(list)
33
+ for r in records:
34
+ for field in ("entity_email", "entity_ip", "entity_phone",
35
+ "entity_domain", "entity_hash"):
36
+ val = getattr(r, field, None)
37
+ if val:
38
+ key = f"{field}:{val}"
39
+ index[key].append(r.row_id)
40
+ return dict(index)
41
+
42
+
43
+ def _find_clusters(index: Dict[str, List[str]]) -> List[Set[str]]:
44
+ """
45
+ Find clusters of row_ids that share at least one entity value.
46
+ Uses simple union-find.
47
+ """
48
+ parent: Dict[str, str] = {}
49
+
50
+ def find(x: str) -> str:
51
+ while parent.get(x, x) != x:
52
+ parent[x] = parent.get(parent[x], parent[x])
53
+ x = parent[x]
54
+ return x
55
+
56
+ def union(a: str, b: str) -> None:
57
+ ra, rb = find(a), find(b)
58
+ if ra != rb:
59
+ parent[ra] = rb
60
+
61
+ for entity_key, row_ids in index.items():
62
+ if len(row_ids) > 1:
63
+ first = row_ids[0]
64
+ for rid in row_ids[1:]:
65
+ union(first, rid)
66
+
67
+ clusters: Dict[str, Set[str]] = defaultdict(set)
68
+ all_ids = set()
69
+ for row_ids in index.values():
70
+ all_ids.update(row_ids)
71
+ for rid in all_ids:
72
+ root = find(rid)
73
+ clusters[root].add(rid)
74
+
75
+ # Only return clusters with 2+ members
76
+ return [c for c in clusters.values() if len(c) > 1]
77
+
78
+
79
+ def run(engine_input: EngineInput) -> EngineOutput:
80
+ """
81
+ Correlate records by shared entity values.
82
+ """
83
+ try:
84
+ records = engine_input.records
85
+ index = _build_entity_index(records)
86
+ clusters = _find_clusters(index)
87
+
88
+ # Annotate records with cluster IDs
89
+ row_to_cluster: Dict[str, int] = {}
90
+ for i, cluster in enumerate(clusters):
91
+ for rid in cluster:
92
+ row_to_cluster[rid] = i
93
+
94
+ annotated: List[NormalizedRecord] = []
95
+ for r in records:
96
+ cluster_id = row_to_cluster.get(r.row_id)
97
+ if cluster_id is not None:
98
+ extra = dict(r.extra)
99
+ extra["correlation_cluster"] = cluster_id
100
+ annotated.append(r.model_copy(update={"extra": extra}))
101
+ else:
102
+ annotated.append(r)
103
+
104
+ correlated_count = len(row_to_cluster)
105
+ return EngineOutput(
106
+ stage="correlation",
107
+ status=StageStatus.SUCCESS,
108
+ records=annotated,
109
+ summary=f"Found {len(clusters)} clusters linking {correlated_count} records",
110
+ metadata={
111
+ "cluster_count": len(clusters),
112
+ "correlated_records": correlated_count,
113
+ "entity_index_size": len(index),
114
+ },
115
+ )
116
+ except Exception as exc:
117
+ logger.error("Correlation failed: %s", exc, exc_info=True)
118
+ return EngineOutput(
119
+ stage="correlation",
120
+ status=StageStatus.FAILED,
121
+ error=str(exc),
122
+ )
123
+
124
+
125
+ if __name__ == "__main__":
126
+ print("Correlation module β€” use via engine pipeline")
modules/correlation/correlate_ioc.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from schemas.osint_module_io import OSINTModuleInput, OSINTModuleOutput
4
+
5
+ def run(input_path: str) -> OSINTModuleOutput:
6
+ """
7
+ Correlation phase:
8
+ - Loads multiple module outputs if needed
9
+ - Cross-references IOCs/IOAs (MISP, Shodan, honeypots)
10
+ - Clusters related events/patterns
11
+ """
12
+ cfg = OSINTModuleInput.parse_file(input_path)
13
+ summary = "Correlation complete"
14
+ indicators = {}
15
+ confidence = 0.0
16
+ return OSINTModuleOutput(summary=summary, indicators=indicators, confidence=confidence)
17
+
18
+ if __name__ == "__main__":
19
+ parser = argparse.ArgumentParser(description="Run the Correlation module")
20
+ parser.add_argument("input", help="Path to OSINTModuleInput JSON")
21
+ args = parser.parse_args()
22
+ out = run(args.input)
23
+ print(out.json())
modules/export/export_results.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Export module β€” writes final artifacts (JSON summary, per-stage outputs).
3
+
4
+ Engine contract:
5
+ run(EngineInput) -> EngineOutput
6
+
7
+ The main report generation (HTML, CSV, JSONL) is handled by
8
+ ``engine.reporting``. This module produces a supplementary JSON
9
+ summary artifact in the run directory.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Any, Dict
18
+
19
+ from engine.io_contract import (
20
+ Artifact,
21
+ EngineInput,
22
+ EngineOutput,
23
+ StageStatus,
24
+ )
25
+
26
+ logger = logging.getLogger("modules.export")
27
+
28
+
29
+ def run(engine_input: EngineInput) -> EngineOutput:
30
+ """
31
+ Export a JSON summary of the pipeline run.
32
+ """
33
+ try:
34
+ run_dir = Path(engine_input.run_dir)
35
+ exports_dir = run_dir / "exports"
36
+ exports_dir.mkdir(parents=True, exist_ok=True)
37
+
38
+ # Build summary
39
+ summary_data = {
40
+ "run_id": engine_input.run_id,
41
+ "total_records": len(engine_input.records),
42
+ "input_files": [
43
+ {
44
+ "name": f.path.name,
45
+ "type": f.file_type.value,
46
+ "size": f.size_bytes,
47
+ }
48
+ for f in engine_input.input_spec.files
49
+ ],
50
+ "record_sample": [
51
+ {
52
+ "row_id": r.row_id,
53
+ "source": r.source_file,
54
+ "entity_name": r.entity_name,
55
+ "entity_email": r.entity_email,
56
+ "entity_ip": r.entity_ip,
57
+ }
58
+ for r in engine_input.records[:20]
59
+ ],
60
+ }
61
+
62
+ out_path = exports_dir / "summary.json"
63
+ out_path.write_text(
64
+ json.dumps(summary_data, indent=2, ensure_ascii=False, default=str),
65
+ encoding="utf-8",
66
+ )
67
+ logger.info("Exported summary to %s", out_path)
68
+
69
+ return EngineOutput(
70
+ stage="export",
71
+ status=StageStatus.SUCCESS,
72
+ records=engine_input.records, # pass through
73
+ artifacts=[
74
+ Artifact(
75
+ name="summary.json",
76
+ path=out_path,
77
+ mime_type="application/json",
78
+ description="Pipeline run summary",
79
+ )
80
+ ],
81
+ summary=f"Exported summary.json ({len(engine_input.records)} records)",
82
+ )
83
+ except Exception as exc:
84
+ logger.error("Export failed: %s", exc, exc_info=True)
85
+ return EngineOutput(
86
+ stage="export",
87
+ status=StageStatus.FAILED,
88
+ error=str(exc),
89
+ )
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Legacy compatibility
94
+ # ---------------------------------------------------------------------------
95
+
96
+ def export(data: Any, outpath: str) -> str:
97
+ """Legacy wrapper (deprecated). Use ``run()`` instead."""
98
+ with open(outpath, "w") as f:
99
+ f.write(str(data))
100
+ return outpath
101
+
102
+
103
+ if __name__ == "__main__":
104
+ print(export({"result": 123}, "output.txt"))
modules/ingestion/__init__.py ADDED
File without changes
modules/ingestion/gathering/web_scraper.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ def scrape(url):
2
+ # Placeholder: insert scraping logic
3
+ return {"url": url, "data": "scraped_content"}
4
+
5
+ if __name__ == "__main__":
6
+ print(scrape("https://example.com"))
modules/ingestion/ingest_data.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingestion module β€” reads input files and produces NormalizedRecords.
3
+
4
+ Engine contract:
5
+ run(EngineInput) -> EngineOutput
6
+
7
+ The heavy lifting (file parsing, entity extraction) is delegated to
8
+ ``engine.normalize``. This module acts as the pipeline-stage wrapper.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from typing import Any, Dict
15
+
16
+ from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
17
+ from engine.normalize import normalize_files
18
+
19
+ logger = logging.getLogger("modules.ingestion")
20
+
21
+
22
+ def run(engine_input: EngineInput) -> EngineOutput:
23
+ """
24
+ Ingest files described by ``engine_input.input_spec`` and return
25
+ normalized records.
26
+ """
27
+ try:
28
+ records = normalize_files(engine_input.input_spec)
29
+ return EngineOutput(
30
+ stage="ingestion",
31
+ status=StageStatus.SUCCESS,
32
+ records=records,
33
+ summary=f"Ingested {len(records)} records from {len(engine_input.input_spec.files)} files",
34
+ )
35
+ except Exception as exc:
36
+ logger.error("Ingestion failed: %s", exc, exc_info=True)
37
+ return EngineOutput(
38
+ stage="ingestion",
39
+ status=StageStatus.FAILED,
40
+ error=str(exc),
41
+ )
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Legacy compatibility β€” keep old function signature working
46
+ # ---------------------------------------------------------------------------
47
+
48
+ def ingest(file_path: str) -> Dict[str, Any]:
49
+ """Legacy wrapper (deprecated). Use ``run()`` instead."""
50
+ return {"file": file_path, "status": "ingested"}
51
+
52
+
53
+ if __name__ == "__main__":
54
+ print(ingest("input.txt"))
modules/ml_analysis/ml_analysis.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ML Analysis module β€” entity extraction and text classification.
3
+
4
+ Engine contract:
5
+ run(EngineInput) -> EngineOutput
6
+
7
+ This is a lightweight placeholder that performs regex-based entity
8
+ extraction. When ML dependencies (torch, transformers, spacy) are
9
+ available, it can be extended to use real models.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import re
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ from engine.io_contract import (
19
+ EngineInput,
20
+ EngineOutput,
21
+ NormalizedRecord,
22
+ StageStatus,
23
+ )
24
+
25
+ logger = logging.getLogger("modules.ml_analysis")
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Lightweight entity extraction (no ML deps required)
29
+ # ---------------------------------------------------------------------------
30
+
31
+ _EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
32
+ _IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
33
+ _PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
34
+ _DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
35
+ _HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
36
+
37
+
38
+ def _enrich_record(record: NormalizedRecord) -> NormalizedRecord:
39
+ """
40
+ Enrich a record by extracting entities from raw_text if not already set.
41
+ """
42
+ text = record.raw_text
43
+ updates: Dict[str, Any] = {}
44
+
45
+ if not record.entity_email:
46
+ m = _EMAIL_RE.search(text)
47
+ if m:
48
+ updates["entity_email"] = m.group(0).lower()
49
+
50
+ if not record.entity_ip:
51
+ m = _IP_RE.search(text)
52
+ if m:
53
+ updates["entity_ip"] = m.group(0)
54
+
55
+ if not record.entity_phone:
56
+ m = _PHONE_RE.search(text)
57
+ if m:
58
+ updates["entity_phone"] = m.group(0)
59
+
60
+ if not record.entity_domain:
61
+ m = _DOMAIN_RE.search(text)
62
+ if m:
63
+ updates["entity_domain"] = m.group(0).lower()
64
+
65
+ if not record.entity_hash:
66
+ m = _HASH_RE.search(text)
67
+ if m:
68
+ updates["entity_hash"] = m.group(0).lower()
69
+
70
+ if updates:
71
+ return record.model_copy(update=updates)
72
+ return record
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Engine contract
77
+ # ---------------------------------------------------------------------------
78
+
79
+ def run(engine_input: EngineInput) -> EngineOutput:
80
+ """
81
+ Run ML analysis / entity enrichment on all records.
82
+ """
83
+ try:
84
+ enriched: List[NormalizedRecord] = []
85
+ enrichment_count = 0
86
+
87
+ for record in engine_input.records:
88
+ new_record = _enrich_record(record)
89
+ if new_record is not record:
90
+ enrichment_count += 1
91
+ enriched.append(new_record)
92
+
93
+ return EngineOutput(
94
+ stage="analysis",
95
+ status=StageStatus.SUCCESS,
96
+ records=enriched,
97
+ summary=f"Analyzed {len(enriched)} records, enriched {enrichment_count}",
98
+ metadata={"enriched_count": enrichment_count},
99
+ )
100
+ except Exception as exc:
101
+ logger.error("ML analysis failed: %s", exc, exc_info=True)
102
+ return EngineOutput(
103
+ stage="analysis",
104
+ status=StageStatus.FAILED,
105
+ error=str(exc),
106
+ )
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Legacy compatibility
111
+ # ---------------------------------------------------------------------------
112
+
113
+ def analyze(data: Any) -> Dict[str, Any]:
114
+ """Legacy wrapper (deprecated). Use ``run()`` instead."""
115
+ return {"input": data, "prediction": "none"}
116
+
117
+
118
+ if __name__ == "__main__":
119
+ print(analyze("test"))
modules/preprocessing/preprocess_data.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocessing module β€” cleans and normalizes raw text in records.
3
+
4
+ Engine contract:
5
+ run(EngineInput) -> EngineOutput
6
+
7
+ Applies basic text cleaning to each record's ``raw_text`` field:
8
+ strip whitespace, normalize unicode, collapse whitespace runs.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import re
15
+ import unicodedata
16
+ from typing import Any, Dict, List
17
+
18
+ from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
19
+
20
+ logger = logging.getLogger("modules.preprocessing")
21
+
22
+
23
+ def _clean_text(text: str) -> str:
24
+ """Basic text normalization."""
25
+ # Unicode NFKC normalization
26
+ text = unicodedata.normalize("NFKC", text)
27
+ # Strip leading/trailing whitespace
28
+ text = text.strip()
29
+ # Collapse multiple whitespace to single space
30
+ text = re.sub(r"\s+", " ", text)
31
+ return text
32
+
33
+
34
+ def run(engine_input: EngineInput) -> EngineOutput:
35
+ """
36
+ Preprocess all records: clean ``raw_text``, normalize entity fields.
37
+ """
38
+ try:
39
+ cleaned: List[NormalizedRecord] = []
40
+ for record in engine_input.records:
41
+ # Create a copy with cleaned text
42
+ updated = record.model_copy(update={
43
+ "raw_text": _clean_text(record.raw_text),
44
+ "entity_name": record.entity_name.strip() if record.entity_name else None,
45
+ "entity_email": record.entity_email.strip().lower() if record.entity_email else None,
46
+ "entity_domain": record.entity_domain.strip().lower() if record.entity_domain else None,
47
+ })
48
+ cleaned.append(updated)
49
+
50
+ return EngineOutput(
51
+ stage="preprocessing",
52
+ status=StageStatus.SUCCESS,
53
+ records=cleaned,
54
+ summary=f"Preprocessed {len(cleaned)} records",
55
+ )
56
+ except Exception as exc:
57
+ logger.error("Preprocessing failed: %s", exc, exc_info=True)
58
+ return EngineOutput(
59
+ stage="preprocessing",
60
+ status=StageStatus.FAILED,
61
+ error=str(exc),
62
+ )
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Legacy compatibility
67
+ # ---------------------------------------------------------------------------
68
+
69
+ def preprocess(text: str) -> str:
70
+ """Legacy wrapper (deprecated). Use ``run()`` instead."""
71
+ return text.strip().lower()
72
+
73
+
74
+ if __name__ == "__main__":
75
+ print(preprocess(" This is RAW DATA. "))
pipeline_config.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai:
2
+ mode: offline
3
+ provider: local_gguf
4
+ fallbacks:
5
+ - local_gguf
6
+ - torch_tfidf
7
+ gguf_path: models/gguf/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
8
+ n_ctx: 2048
9
+ n_threads: 8
10
+ n_gpu_layers: 0
11
+ temperature: 0.2
12
+ top_p: 0.95
13
+ max_tokens: 256
14
+ remote:
15
+ enabled: false
requirements-hf.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MOD-OSINT β€” Hugging Face Docker Space runtime requirements
2
+ # Intentionally lean: no torch, no llama-cpp, no heavy ML.
3
+ # ML analysis is toggled OFF by default in the GUI.
4
+
5
+ # GUI
6
+ streamlit>=1.35.0
7
+
8
+ # Data handling
9
+ pandas>=2.0.0
10
+ pyyaml>=6.0.2
11
+
12
+ # Engine core
13
+ pydantic>=2.10.0
14
+ jinja2>=3.1.4
15
+
16
+ # Security
17
+ certifi>=2024.12.14
samples/demo_ingest/example.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ name,phone
2
+ John,8105551212
samples/demo_ingest/example.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <html><body>demo html</body></html>
samples/demo_ingest/example.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"note":"demo"}
samples/demo_ingest/example.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ hello beta
scripts/docker_entrypoint.sh ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # MOD-OSINT Docker entrypoint β€” startup self-check + Streamlit launch
3
+ set -euo pipefail
4
+
5
+ PORT="${STREAMLIT_PORT:-7860}"
6
+ HOST="${STREAMLIT_HOST:-0.0.0.0}"
7
+
8
+ if ! [[ "$PORT" =~ ^[0-9]+$ ]] || [ "$PORT" -lt 1 ] || [ "$PORT" -gt 65535 ]; then
9
+ echo "[fatal] Invalid STREAMLIT_PORT: ${PORT}" >&2
10
+ exit 1
11
+ fi
12
+
13
+ required_files=(
14
+ "gui/streamlit_app.py"
15
+ "engine/pipeline_orchestrator.py"
16
+ "requirements-hf.txt"
17
+ )
18
+ for file_path in "${required_files[@]}"; do
19
+ if [ ! -f "$file_path" ]; then
20
+ echo "[fatal] Missing required file: $file_path" >&2
21
+ exit 1
22
+ fi
23
+ done
24
+
25
+ for dir_path in runs logs; do
26
+ mkdir -p "$dir_path"
27
+ if [ ! -w "$dir_path" ]; then
28
+ echo "[fatal] Directory not writable: $dir_path" >&2
29
+ exit 1
30
+ fi
31
+ done
32
+
33
+ python3 - <<'PY'
34
+ import importlib
35
+ import sys
36
+
37
+ print(f"[startup] Python: {sys.version.split()[0]}")
38
+ for mod in ("streamlit", "pandas", "yaml"):
39
+ importlib.import_module(mod)
40
+ print("[startup] Dependency import check: OK")
41
+ PY
42
+
43
+ echo "════════════════════════════════════════════════════════"
44
+ echo " MOD-OSINT | Streamlit GUI"
45
+ echo " Binding: ${HOST}:${PORT}"
46
+ echo " Health URL: http://localhost:${PORT}/_stcore/health"
47
+ echo "════════════════════════════════════════════════════════"
48
+
49
+ if [ "${MODOSINT_STARTUP_CHECK_ONLY:-0}" = "1" ]; then
50
+ echo "[startup] MODOSINT_STARTUP_CHECK_ONLY=1 -> exiting after checks"
51
+ exit 0
52
+ fi
53
+
54
+ exec streamlit run gui/streamlit_app.py \
55
+ --server.address="${HOST}" \
56
+ --server.port="${PORT}" \
57
+ --server.enableCORS=false \
58
+ --server.enableXsrfProtection=false \
59
+ --server.headless=true \
60
+ --browser.gatherUsageStats=false