Spaces:

moddux
/

mod-osint

Runtime error

App Files Files Community

moddux committed on Feb 27

Commit

b75c637

0 Parent(s):

deploy: HF sanitized GUI snapshot

Browse files

Files changed (35) hide show

.dockerignore +59 -0
.github/workflows/hf_sync.yml +126 -0
.gitignore +84 -0
Dockerfile +56 -0
LICENSE +5 -0
README.md +38 -0
engine/__init__.py +13 -0
engine/__main__.py +6 -0
engine/io_contract.py +144 -0
engine/normalize.py +236 -0
engine/pipeline_orchestrator.py +275 -0
engine/registry.py +134 -0
engine/reporting.py +208 -0
engine/storage.py +189 -0
engine/templates/report.html +171 -0
gui/__init__.py +5 -0
gui/streamlit_app.py +332 -0
gui/terminal_panel.py +127 -0
modules/README.txt +14 -0
modules/__init__.py +0 -0
modules/correlation/correlate.py +126 -0
modules/correlation/correlate_ioc.py +23 -0
modules/export/export_results.py +104 -0
modules/ingestion/__init__.py +0 -0
modules/ingestion/gathering/web_scraper.py +6 -0
modules/ingestion/ingest_data.py +54 -0
modules/ml_analysis/ml_analysis.py +119 -0
modules/preprocessing/preprocess_data.py +75 -0
pipeline_config.yaml +15 -0
requirements-hf.txt +17 -0
samples/demo_ingest/example.csv +2 -0
samples/demo_ingest/example.html +1 -0
samples/demo_ingest/example.json +1 -0
samples/demo_ingest/example.txt +1 -0
scripts/docker_entrypoint.sh +60 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,59 @@

+# Version control
+.git/
+.gitignore
+# Python caches
+# A pattern without a leading "**/" is matched against the context root only,
+# so the recursive forms are required to keep nested caches (engine/, gui/,
+# modules/, ...) out of the image.
+__pycache__/
+**/__pycache__/
+*.pyc
+**/*.pyc
+*.pyo
+**/*.pyo
+*.pyd
+**/*.pyd
+*.egg-info/
+dist/
+build/
+.eggs/
+# Virtual environments
+.venv/
+venv/
+env/
+# Runtime outputs — never bake into image
+runs/
+artifacts/
+logs/
+*.log
+*.sqlite
+*.db
+# Secrets / env
+# The "**/" forms also exclude env files nested below the context root;
+# the bare patterns only match files directly in the root.
+*.env
+**/*.env
+.env
+**/.env
+.env.*
+**/.env.*
+Vault/
+# macOS
+.DS_Store
+**/.DS_Store
+.AppleDouble
+# CI / audit artefacts
+.branch_audit/
+*.csv
+*_output.json
+safety_report.json
+# Heavy model files
+models/gguf/
+ml_models/
+*.gguf
+*.bin
+*.pt
+*.pth
+# Dev-only docs / notebooks
+*.ipynb
+*.odt
+*.tex
+*.pdf
+# Legacy dashboard (not served by container)
+dashboard/

.github/workflows/hf_sync.yml ADDED Viewed

	@@ -0,0 +1,126 @@

+name: Sync to HF Space
+on:
+  push:
+    branches:
+      - beta/sanitized-minimal
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: "Branch or SHA to deploy (default: beta/sanitized-minimal)"
+        required: false
+        default: "beta/sanitized-minimal"
+concurrency:
+  group: hf-space-sync
+  cancel-in-progress: true
+jobs:
+  hf-sync:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - name: Checkout deployment ref
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.ref }}
+      - name: Resolve checked out ref
+        id: ref
+        run: |
+          BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD)
+          if [ "$BRANCH_NAME" = "HEAD" ]; then
+            BRANCH_NAME=$(git name-rev --name-only --exclude=tags/* HEAD | sed 's#^remotes/origin/##')
+          fi
+          echo "branch_name=$BRANCH_NAME" >> "$GITHUB_OUTPUT"
+          echo "Deploying ref: $BRANCH_NAME @ $(git rev-parse --short HEAD)"
+      - name: Enforce sanitized deployment policy
+        run: |
+          set -euo pipefail
+          echo "Checking required runtime files..."
+          required_files=(
+            "README.md"
+            "Dockerfile"
+            "requirements-hf.txt"
+            "gui/streamlit_app.py"
+            "engine/pipeline_orchestrator.py"
+            "scripts/docker_entrypoint.sh"
+          )
+          for f in "${required_files[@]}"; do
+            [ -f "$f" ] || { echo "::error::Missing required file: $f"; exit 1; }
+          done
+          echo "Checking blocked paths/files..."
+          blocked_paths=(
+            "Docs"
+            "Docs/audit"
+            "Vault"
+            "logs"
+            "runs"
+            ".env"
+            "TODO.md"
+            "docs/progress.md"
+          )
+          violations=0
+          for p in "${blocked_paths[@]}"; do
+            if [ -e "$p" ]; then
+              echo "::error::Blocked path present in deployment ref: $p"
+              violations=1
+            fi
+          done
+          [ "$violations" -eq 0 ]
+          echo "Checking for obvious secret patterns..."
+          secret_patterns=(
+            'AKIA[0-9A-Z]{16}'
+            'ASIA[0-9A-Z]{16}'
+            'ghp_[A-Za-z0-9]{36}'
+            'hf_[A-Za-z0-9]{30,}'
+            'sk_live_[0-9A-Za-z]{20,}'
+            '-----BEGIN (RSA|EC|OPENSSH|DSA) PRIVATE KEY-----'
+            'xox[baprs]-[A-Za-z0-9-]{10,}'
+          )
+          # Prefer ripgrep, but fall back to grep so the scan is never silently
+          # skipped just because rg is not installed on the runner.
+          if command -v rg >/dev/null 2>&1; then
+            scan_cmd=(rg -n --hidden -g '!.git/**' -g '!**/*.md')
+          else
+            scan_cmd=(grep -rnIE --exclude-dir=.git --exclude='*.md')
+          fi
+          for pat in "${secret_patterns[@]}"; do
+            scan_cmd+=(-e "$pat")
+          done
+          # Both tools exit 0 on a match, 1 when nothing matched, and >1 on error.
+          # Fail closed on errors instead of treating them as a clean scan.
+          scan_rc=0
+          "${scan_cmd[@]}" . || scan_rc=$?
+          case "$scan_rc" in
+            0) echo "::error::Potential secret detected"; exit 1 ;;
+            1) echo "No obvious secret patterns found." ;;
+            *) echo "::error::Secret scan failed (exit $scan_rc)"; exit 1 ;;
+          esac
+      - name: Check large files (>10 MB)
+        run: |
+          set -euo pipefail
+          large_files=$(find . -not -path './.git/*' -type f -size +10M 2>/dev/null || true)
+          if [ -n "$large_files" ]; then
+            echo "::warning::Large files detected (>10MB):"
+            echo "$large_files"
+            echo "Consider Git LFS for required large assets."
+          else
+            echo "No large files >10MB found."
+          fi
+      - name: Push to HF Space
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          # Passed through the environment rather than interpolated into the
+          # script text, so a crafted ref name cannot inject shell commands.
+          BRANCH_NAME: ${{ steps.ref.outputs.branch_name }}
+        run: |
+          set -euo pipefail
+          if [ -z "${HF_TOKEN:-}" ]; then
+            echo "::error::HF_TOKEN secret is not set. Configure it in GitHub Actions secrets."
+            exit 1
+          fi
+          git config user.email "ci@github.com"
+          git config user.name "CI Bot"
+          git remote remove hf 2>/dev/null || true
+          git remote add hf "https://moddux:${HF_TOKEN}@huggingface.co/spaces/moddux/mod-osint"
+          # --force-with-lease compares the remote ref against our remote-tracking
+          # ref. A freshly added remote has none, so fetch it first; otherwise the
+          # push is rejected as "stale info" whenever HF main already exists.
+          git fetch hf main || echo "::notice::Could not fetch hf/main (first deploy?)"
+          echo "Pushing $(git rev-parse --short HEAD) from '${BRANCH_NAME}' -> HF main"
+          git push hf HEAD:main --force-with-lease
+          echo "HF deploy push complete."
+          echo "Logs: https://huggingface.co/spaces/moddux/mod-osint/logs"

.gitignore ADDED Viewed

	@@ -0,0 +1,84 @@

+# ===== Python =====
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.egg-info/
+*.egg
+dist/
+build/
+*.whl
+# ===== Virtual environments =====
+.venv/
+venv/
+env/
+# ===== IDE / OS =====
+.git/
+.DS_Store
+._*
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+# ===== Environment / secrets =====
+*.env
+.env.*
+# ===== Logs =====
+*.log
+logs/
+runner/logs/
+# ===== Databases =====
+*.sqlite
+*.sqlite3
+*.db
+# ===== Run artifacts (engine output) =====
+runs/
+output/
+*_output.json
+*_output.csv
+# ===== Legacy generated artifacts =====
+artifacts/*.json
+artifacts/*.csv
+artifacts/*.html
+artifacts/*.svg
+artifacts/*.pdf
+NETLOG*
+DNSESS*
+graph.cypher
+# ===== Data directory (user-supplied, not tracked) =====
+Data/
+# ===== ML models (large binaries) =====
+venv/models/gguf/*.gguf
+models/
+*.gguf
+*.bin
+*.safetensors
+# ===== Nested repo (do not track) =====
+MOD_OSINT/
+# ===== Legacy uppercase Modules (quarantined, kept on disk) =====
+# NOTE: On case-insensitive filesystems (macOS), we cannot gitignore
+# "Modules/" without also blocking "modules/".  The uppercase files
+# were removed from the index in commit 1284d1a.  We rely on the
+# index removal rather than .gitignore for Modules/.
+# ===== Misc generated =====
+pipeline_status.json
+*.zip
+*.tar.gz
+htmlcov/
+.coverage
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/

Dockerfile ADDED Viewed

	@@ -0,0 +1,56 @@

+# MOD-OSINT — Hugging Face Docker Space
+# Runs the GUI wizard wired to engine/pipeline_orchestrator.py on port 7860.
+FROM python:3.11-slim
+ARG BUILD_DATE="unknown"
+ARG VCS_REF="unknown"
+LABEL maintainer="moddux" \
+      org.opencontainers.image.title="MOD-OSINT" \
+      org.opencontainers.image.description="MOD-OSINT Streamlit GUI for HF Docker Space" \
+      org.opencontainers.image.source="https://github.com/moddux/MOD-OSINT" \
+      org.opencontainers.image.created="${BUILD_DATE}" \
+      org.opencontainers.image.revision="${VCS_REF}"
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
+    STREAMLIT_SERVER_HEADLESS=true \
+    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
+    STREAMLIT_SERVER_PORT=7860
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        git \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install runtime dependencies first for cache reuse.
+COPY requirements-hf.txt ./requirements-hf.txt
+RUN python -m pip install --upgrade pip \
+    && pip install --no-cache-dir -r requirements-hf.txt
+# Create runtime user before copying app files.
+RUN useradd -m -u 1000 appuser
+# Copy application source (honors .dockerignore).
+COPY --chown=appuser:appuser . .
+# Runtime dirs and entrypoint permissions.
+RUN mkdir -p /app/runs /app/logs \
+    && chown -R appuser:appuser /app/runs /app/logs \
+    && chmod 775 /app/runs /app/logs \
+    && chmod +x /app/scripts/docker_entrypoint.sh
+USER appuser
+EXPOSE 7860
+HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=5 \
+  CMD curl -fsS http://127.0.0.1:7860/_stcore/health || exit 1
+ENTRYPOINT ["bash", "scripts/docker_entrypoint.sh"]

LICENSE ADDED Viewed

	@@ -0,0 +1,5 @@

+MIT License
+Copyright (c) 2025
+Permission is hereby granted, free of charge, to any person obtaining a copy...

README.md ADDED Viewed

	@@ -0,0 +1,38 @@

+---
+title: mod-osint
+emoji: "🧠"
+colorFrom: gray
+colorTo: blue
+sdk: docker
+app_port: 7860
+pinned: false
+---
+# MOD-OSINT (Beta Sanitized Build)
+This branch is the sanitized beta deployment bundle for:
+- Hugging Face Space: https://huggingface.co/spaces/moddux/mod-osint
+What is included:
+- Runtime engine (`engine/`)
+- Runtime modules (`modules/`)
+- Streamlit GUI (`gui/`)
+- Docker runtime files (`Dockerfile`, `requirements-hf.txt`, `scripts/docker_entrypoint.sh`)
+What is excluded:
+- Audit logs and runtime logs
+- Development TODO/progress notes
+- Internal vault/secrets files
+- Non-runtime integrations and large artifacts
+## Local Run
+```bash
+docker build -t mod-osint-beta .
+docker run --rm -p 7860:7860 mod-osint-beta
+```
+Then open: http://localhost:7860

engine/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""
+MOD-OSINT Engine — centralized pipeline orchestration.
+Submodules:
+    io_contract  – Pydantic v2 typed IO schemas
+    registry     – module registration + dependency graph
+    normalize    – canonical schema + entity linking
+    storage      – SQL storage (SQLite / Postgres)
+    reporting    – HTML report + JSONL/CSV exports
+    pipeline_orchestrator – single entrypoint
+"""
+__version__ = "0.1.0"

engine/__main__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Allow ``python -m engine`` to run the pipeline orchestrator."""
+from engine.pipeline_orchestrator import main
+if __name__ == "__main__":
+    main()

engine/io_contract.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+IO Contract — typed schemas for the MOD-OSINT engine.
+Every pipeline module must accept ``EngineInput`` and return ``EngineOutput``.
+The engine orchestrator builds ``RunContext`` which carries paths and DB handles.
+"""
+from __future__ import annotations
+import uuid
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field
+# ---------------------------------------------------------------------------
+# Enums
+# ---------------------------------------------------------------------------
+class FileType(str, Enum):
+    CSV = "csv"
+    JSON = "json"
+    TXT = "txt"
+    HTML = "html"
+    LOG = "log"
+    UNKNOWN = "unknown"
+class StageStatus(str, Enum):
+    PENDING = "pending"
+    RUNNING = "running"
+    SUCCESS = "success"
+    FAILED = "failed"
+    SKIPPED = "skipped"
+# ---------------------------------------------------------------------------
+# Input specs
+# ---------------------------------------------------------------------------
+class InputFile(BaseModel):
+    """Descriptor for a single ingested file."""
+    path: Path
+    file_type: FileType = FileType.UNKNOWN
+    size_bytes: int = 0
+    sha256: str = ""
+class InputSpec(BaseModel):
+    """Describes the full set of input data for a pipeline run."""
+    input_dir: Path
+    files: List[InputFile] = Field(default_factory=list)
+# ---------------------------------------------------------------------------
+# Normalized records
+# ---------------------------------------------------------------------------
+class NormalizedRecord(BaseModel):
+    """
+    Canonical record produced by the normalization stage.
+    Each record carries a ``row_id`` plus optional entity-linking keys so
+    downstream modules can join/correlate across sources.
+    Note: the default ``row_id`` is a random UUID fragment, so an ID is only
+    deterministic when the code constructing the record passes one explicitly.
+    """
+    # 12 hex chars; random unless supplied by the caller.
+    row_id: str = Field(default_factory=lambda: uuid.uuid4().hex[:12])
+    # Name of the file the record was read from, and its detected type.
+    source_file: str = ""
+    source_type: FileType = FileType.UNKNOWN
+    timestamp: Optional[datetime] = None
+    # Entity-linking keys; each is optional and None when nothing was found.
+    entity_name: Optional[str] = None
+    entity_phone: Optional[str] = None
+    entity_email: Optional[str] = None
+    entity_ip: Optional[str] = None
+    entity_domain: Optional[str] = None
+    entity_hash: Optional[str] = None
+    # Text the record was built from, plus any additional source fields.
+    raw_text: str = ""
+    extra: Dict[str, Any] = Field(default_factory=dict)
+# ---------------------------------------------------------------------------
+# Artifacts
+# ---------------------------------------------------------------------------
+class Artifact(BaseModel):
+    """A single output artifact produced by a module."""
+    name: str
+    path: Path
+    mime_type: str = "application/octet-stream"
+    description: str = ""
+# ---------------------------------------------------------------------------
+# Engine IO (module contract)
+# ---------------------------------------------------------------------------
+class EngineInput(BaseModel):
+    """
+    Standard input passed to every pipeline module's ``run()`` function.
+    Modules receive the normalized records from prior stages plus the
+    run context (paths, config).
+    """
+    run_id: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6])
+    input_spec: InputSpec
+    records: List[NormalizedRecord] = Field(default_factory=list)
+    config: Dict[str, Any] = Field(default_factory=dict)
+    run_dir: Path = Path("runs/default")
+    previous_artifacts: List[Artifact] = Field(default_factory=list)
+class EngineOutput(BaseModel):
+    """
+    Standard output returned by every pipeline module's ``run()`` function.
+    """
+    stage: str
+    status: StageStatus = StageStatus.SUCCESS
+    records: List[NormalizedRecord] = Field(default_factory=list)
+    artifacts: List[Artifact] = Field(default_factory=list)
+    summary: str = ""
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+# ---------------------------------------------------------------------------
+# Run context (internal, built by orchestrator)
+# ---------------------------------------------------------------------------
+class RunContext(BaseModel):
+    """
+    Internal context object built by the orchestrator for a single run.
+    Carries the run ID, run directory, DB path, input spec, config overrides
+    and the per-stage results accumulated during the run.
+    """
+    # Pydantic v2 model configuration. This replaces the v1-style inner
+    # ``class Config`` (deprecated in v2) while keeping the same setting.
+    model_config = {"arbitrary_types_allowed": True}
+    run_id: str
+    run_dir: Path
+    db_path: Path
+    input_spec: InputSpec
+    config: Dict[str, Any] = Field(default_factory=dict)
+    # Keyed by stage name.
+    stage_results: Dict[str, EngineOutput] = Field(default_factory=dict)

engine/normalize.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+Normalize — canonical schema builder + entity linking keys.
+Reads raw files from the input directory, detects file types, parses
+them into ``NormalizedRecord`` instances with deterministic row IDs
+and entity-linking fields.
+"""
+from __future__ import annotations
+import csv
+import hashlib
+import io
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from engine.io_contract import FileType, InputFile, InputSpec, NormalizedRecord
+logger = logging.getLogger("engine.normalize")
+# ---------------------------------------------------------------------------
+# File-type detection
+# ---------------------------------------------------------------------------
+_EXT_MAP: Dict[str, FileType] = {
+    ".csv": FileType.CSV,
+    ".json": FileType.JSON,
+    ".txt": FileType.TXT,
+    ".html": FileType.HTML,
+    ".htm": FileType.HTML,
+    ".log": FileType.LOG,
+}
+def detect_file_type(path: Path) -> FileType:
+    """Map the lower-cased extension of *path* to a ``FileType``; unmapped extensions give ``UNKNOWN``."""
+    extension = path.suffix.lower()
+    if extension in _EXT_MAP:
+        return _EXT_MAP[extension]
+    return FileType.UNKNOWN
+def _file_sha256(path: Path) -> str:
+    """Return the SHA-256 hex digest of the file at *path*, read in 8192-byte chunks."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        while True:
+            block = stream.read(8192)
+            if not block:
+                # Empty read means end of file.
+                break
+            digest.update(block)
+    return digest.hexdigest()
+# ---------------------------------------------------------------------------
+# Build InputSpec from a directory
+# ---------------------------------------------------------------------------
+def build_input_spec(input_dir: Path) -> InputSpec:
+    """
+    Build an ``InputSpec`` describing the files found at *input_dir*.
+    If *input_dir* is a regular file, the spec holds just that file and its
+    ``input_dir`` is the file's parent.  Otherwise the immediate children are
+    listed in sorted order, keeping regular files whose name does not start
+    with a dot.
+    """
+    def _describe(entry: Path) -> InputFile:
+        # One descriptor per file: detected type, size on disk, content hash.
+        return InputFile(
+            path=entry,
+            file_type=detect_file_type(entry),
+            size_bytes=entry.stat().st_size,
+            sha256=_file_sha256(entry),
+        )
+    root = Path(input_dir)
+    if root.is_file():
+        # Single file mode
+        files: List[InputFile] = [_describe(root)]
+        root = root.parent
+    else:
+        files = [
+            _describe(entry)
+            for entry in sorted(root.iterdir())
+            if entry.is_file() and not entry.name.startswith(".")
+        ]
+    logger.info("InputSpec: %d files from %s", len(files), root)
+    return InputSpec(input_dir=root, files=files)
+# ---------------------------------------------------------------------------
+# Deterministic row ID
+# ---------------------------------------------------------------------------
+def _make_row_id(source_file: str, index: int, content_hash: str) -> str:
+    """
+    Derive a stable row ID: the first 12 hex chars of
+    SHA-256("<source_file>:<index>:<content_hash>").
+    """
+    key = "{}:{}:{}".format(source_file, index, content_hash)
+    digest = hashlib.sha256(key.encode()).hexdigest()
+    return digest[:12]
+# ---------------------------------------------------------------------------
+# Entity extraction helpers (lightweight, no ML)
+# ---------------------------------------------------------------------------
+_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
+# "+" is a non-word character, so ``\b\+`` only matches when the "+" directly
+# follows a word character; in text like "call +1555..." the prefix was
+# dropped.  Accept either an explicit "+" or a word boundary before the digits.
+_PHONE_RE = re.compile(r"(?:\+|\b)1?\d{9,15}\b")
+_DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
+_HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
+def _first_valid_ipv4(text: str) -> Optional[str]:
+    """Return the first dotted quad in *text* whose four octets are all <= 255, else None."""
+    for candidate in _IP_RE.finditer(text):
+        quad = candidate.group(0)
+        if all(int(octet) <= 255 for octet in quad.split(".")):
+            return quad
+    return None
+def _extract_entities(text: str) -> Dict[str, Optional[str]]:
+    """
+    Extract the first occurrence of each common entity type from *text*.
+    Returns:
+        Dict with keys ``entity_email``, ``entity_ip``, ``entity_phone``,
+        ``entity_domain`` and ``entity_hash``; a value is ``None`` when no
+        match was found.  Dotted quads with an octet above 255 are skipped
+        for ``entity_ip``.
+    """
+    email_m = _EMAIL_RE.search(text)
+    phone_m = _PHONE_RE.search(text)
+    domain_m = _DOMAIN_RE.search(text)
+    hash_m = _HASH_RE.search(text)
+    return {
+        "entity_email": email_m.group(0) if email_m else None,
+        "entity_ip": _first_valid_ipv4(text),
+        "entity_phone": phone_m.group(0) if phone_m else None,
+        "entity_domain": domain_m.group(0) if domain_m else None,
+        "entity_hash": hash_m.group(0) if hash_m else None,
+    }
+# ---------------------------------------------------------------------------
+# Parsers per file type
+# ---------------------------------------------------------------------------
+def _parse_csv(path: Path) -> List[NormalizedRecord]:
+    """
+    Parse a CSV file with a header row into one ``NormalizedRecord`` per data row.
+    The row is serialized with ``json.dumps`` to form ``raw_text`` and the
+    input to entity extraction.  ``name``/``phone``/``email`` are taken from
+    like-named columns when present and non-empty; phone and email otherwise
+    fall back to the regex-extracted values.
+    """
+    records: List[NormalizedRecord] = []
+    with open(path, newline="", encoding="utf-8", errors="replace") as f:
+        # Cells beyond the header width are collected under a string key.
+        # The default ``restkey`` is ``None``, which is not a valid key for the
+        # ``Dict[str, Any]`` ``extra`` field, so one over-long row made record
+        # construction raise and the whole file was lost.
+        reader = csv.DictReader(f, restkey="_extra")
+        for idx, row in enumerate(reader):
+            text = json.dumps(row, ensure_ascii=False)
+            content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
+            entities = _extract_entities(text)
+            # Try to pick up common column names
+            name = row.get("name") or row.get("Name") or row.get("entity_name")
+            phone = row.get("phone") or row.get("Phone") or row.get("entity_phone")
+            email = row.get("email") or row.get("Email") or row.get("entity_email")
+            records.append(NormalizedRecord(
+                row_id=_make_row_id(str(path), idx, content_hash),
+                source_file=str(path.name),
+                source_type=FileType.CSV,
+                # Entity extraction has no name detector, so there is no fallback here.
+                entity_name=name or None,
+                entity_phone=phone or entities.get("entity_phone"),
+                entity_email=email or entities.get("entity_email"),
+                entity_ip=entities.get("entity_ip"),
+                entity_domain=entities.get("entity_domain"),
+                entity_hash=entities.get("entity_hash"),
+                raw_text=text,
+                extra=dict(row),
+            ))
+    return records
+def _parse_json(path: Path) -> List[NormalizedRecord]:
+    """
+    Parse a JSON file into ``NormalizedRecord`` instances.
+    A top-level list yields one record per element; any other top-level value
+    yields a single record.  Dict elements are stored as ``extra`` and
+    serialized with ``json.dumps`` for ``raw_text``; other elements are
+    stringified with ``str`` and wrapped as ``{"value": item}``.
+    """
+    records: List[NormalizedRecord] = []
+    with open(path, encoding="utf-8", errors="replace") as f:
+        data = json.load(f)
+    # Handle both single object and list of objects
+    items = data if isinstance(data, list) else [data]
+    for idx, item in enumerate(items):
+        is_object = isinstance(item, dict)
+        text = json.dumps(item, ensure_ascii=False) if is_object else str(item)
+        content_hash = hashlib.sha256(text.encode()).hexdigest()[:16]
+        entities = _extract_entities(text)
+        extra = item if is_object else {"value": item}
+        # ``entity_name`` is declared Optional[str].  A non-string "name" (for
+        # example a list or nested object) fails validation of that field and
+        # would abort parsing of the whole file, so only strings are promoted;
+        # the original value stays available in ``extra``.
+        candidate_name = extra.get("name")
+        entity_name = candidate_name if isinstance(candidate_name, str) else None
+        records.append(NormalizedRecord(
+            row_id=_make_row_id(str(path), idx, content_hash),
+            source_file=str(path.name),
+            source_type=FileType.JSON,
+            entity_name=entity_name,
+            entity_email=entities.get("entity_email"),
+            entity_ip=entities.get("entity_ip"),
+            entity_phone=entities.get("entity_phone"),
+            entity_domain=entities.get("entity_domain"),
+            entity_hash=entities.get("entity_hash"),
+            raw_text=text,
+            extra=extra,
+        ))
+    return records
+def _parse_text(path: Path, file_type: FileType = FileType.TXT) -> List[NormalizedRecord]:
+    """
+    Turn a line-oriented file (plain text, HTML, log) into records.
+    Each line is stripped; lines that are empty after stripping are dropped.
+    The index used for the row ID is the line's position in the file, so
+    skipped blank lines still advance it.
+    """
+    with open(path, encoding="utf-8", errors="replace") as handle:
+        raw_lines = list(handle)
+    parsed: List[NormalizedRecord] = []
+    for line_no, raw in enumerate(raw_lines):
+        stripped = raw.strip()
+        if stripped:
+            digest = hashlib.sha256(stripped.encode()).hexdigest()[:16]
+            found = _extract_entities(stripped)
+            parsed.append(NormalizedRecord(
+                row_id=_make_row_id(str(path), line_no, digest),
+                source_file=str(path.name),
+                source_type=file_type,
+                entity_email=found.get("entity_email"),
+                entity_ip=found.get("entity_ip"),
+                entity_phone=found.get("entity_phone"),
+                entity_domain=found.get("entity_domain"),
+                entity_hash=found.get("entity_hash"),
+                raw_text=stripped,
+            ))
+    return parsed
+# ---------------------------------------------------------------------------
+# Main normalization entry point
+# ---------------------------------------------------------------------------
+def normalize_files(input_spec: InputSpec) -> List[NormalizedRecord]:
+    """
+    Run the matching parser over every file in *input_spec* and return all
+    resulting ``NormalizedRecord`` instances as one flat list, in file order.
+    A file whose parser raises is logged and skipped; the remaining files
+    are still processed.
+    """
+    # File types that go through the line parser and keep their own type tag.
+    line_based = (FileType.TXT, FileType.LOG, FileType.HTML)
+    collected: List[NormalizedRecord] = []
+    for spec_file in input_spec.files:
+        try:
+            kind = spec_file.file_type
+            if kind == FileType.CSV:
+                parsed = _parse_csv(spec_file.path)
+            elif kind == FileType.JSON:
+                parsed = _parse_json(spec_file.path)
+            else:
+                # Anything that is not a known line-based type is tagged UNKNOWN.
+                tag = kind if kind in line_based else FileType.UNKNOWN
+                parsed = _parse_text(spec_file.path, tag)
+            logger.info("Parsed %d records from %s (%s)", len(parsed), spec_file.path.name, kind.value)
+            collected.extend(parsed)
+        except Exception as failure:
+            logger.error("Failed to parse %s: %s", spec_file.path, failure)
+    logger.info("Total normalized records: %d", len(collected))
+    return collected

engine/pipeline_orchestrator.py ADDED Viewed

	@@ -0,0 +1,275 @@

+"""
+Pipeline Orchestrator — single entrypoint for MOD-OSINT pipeline execution.
+Usage:
+    python -m engine.pipeline_orchestrator --input samples/demo_ingest/
+Flow:
+    1. Build InputSpec from --input path
+    2. Normalize files → NormalizedRecords
+    3. Discover & register pipeline modules
+    4. Run each stage in order: ingestion → preprocessing → analysis → correlation → export
+    5. Store records + stage results in SQLite (or Postgres)
+    6. Generate HTML report + JSONL/CSV exports
+    7. Write run manifest
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, List
+from engine.io_contract import (
+    Artifact,
+    EngineInput,
+    EngineOutput,
+    InputSpec,
+    NormalizedRecord,
+    RunContext,
+    StageStatus,
+)
+from engine.normalize import build_input_spec, normalize_files
+from engine.registry import discover_and_register, get_ordered_stages, get_stage
+from engine.reporting import generate_report
+from engine.storage import StorageBackend, create_storage
+# ---------------------------------------------------------------------------
+# Logging setup
+# ---------------------------------------------------------------------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger("engine.orchestrator")
+# ---------------------------------------------------------------------------
+# Run directory
+# ---------------------------------------------------------------------------
+def _create_run_dir(base: Path = Path("runs")) -> tuple[str, Path]:
+    """
+    Create a new run directory under *base* and return (run_id, run_dir).
+    The run ID is a UTC timestamp (``YYYYMMDD_HHMMSS``) followed by a random
+    6-hex-char suffix, so it differs between calls rather than being
+    deterministic.
+    """
+    # Timestamp keeps IDs sortable; the uuid4 fragment separates runs started in the same second.
+    run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6]
+    run_dir = base / run_id
+    # Also creates *base* (and any missing parents) on first use.
+    run_dir.mkdir(parents=True, exist_ok=True)
+    return run_id, run_dir
+# ---------------------------------------------------------------------------
+# Core pipeline execution
+# ---------------------------------------------------------------------------
+def run_pipeline(
+    input_path: str | Path,
+    config: Dict | None = None,
+    runs_base: Path = Path("runs"),
+) -> RunContext:
+    """
+    Execute the full pipeline.
+    Args:
+        input_path: Path to input directory or single file.
+        config: Optional config overrides.
+        runs_base: Base directory for run outputs.
+    Returns:
+        RunContext with all results.
+    Raises:
+        SystemExit: with exit code 1 if *input_path* does not exist.
+    Notes:
+        The input path is validated before the run directory is created, so
+        a bad path does not leave an empty run directory behind.  The storage
+        backend is closed in a ``finally`` block, so it is released even when
+        persistence, reporting or manifest writing raises.  A stage whose run
+        function raises is recorded as FAILED and the pipeline continues with
+        the next stage.
+    """
+    config = config or {}
+    # 1. Validate the input path before touching the filesystem
+    input_path = Path(input_path)
+    if not input_path.exists():
+        logger.error("Input path does not exist: %s", input_path)
+        sys.exit(1)
+    # 2. Create run directory
+    run_id, run_dir = _create_run_dir(runs_base)
+    logger.info("═══ Pipeline run %s ═══", run_id)
+    logger.info("Run directory: %s", run_dir)
+    # 3. Build input spec
+    input_spec = build_input_spec(input_path)
+    logger.info("Input: %d files from %s", len(input_spec.files), input_spec.input_dir)
+    # 4. Normalize files
+    records = normalize_files(input_spec)
+    logger.info("Normalized: %d records", len(records))
+    # 5. Set up storage
+    db_path = run_dir / "db.sqlite"
+    storage = create_storage(db_path)
+    try:
+        # 6. Store initial normalized records
+        storage.insert_records(records)
+        # 7. Build run context
+        ctx = RunContext(
+            run_id=run_id,
+            run_dir=run_dir,
+            db_path=db_path,
+            input_spec=input_spec,
+            config=config,
+        )
+        # 8. Discover and register modules
+        registered = discover_and_register()
+        logger.info("Registered stages: %s", registered)
+        # 9. Run each stage
+        stage_outputs: Dict[str, EngineOutput] = {}
+        current_records = records
+        all_artifacts: List[Artifact] = []
+        for stage_name in get_ordered_stages():
+            logger.info("── Stage: %s ──", stage_name)
+            run_fn = get_stage(stage_name)
+            if run_fn is None:
+                logger.warning("No run function for stage '%s' — skipping", stage_name)
+                stage_outputs[stage_name] = EngineOutput(
+                    stage=stage_name,
+                    status=StageStatus.SKIPPED,
+                    summary="No run function found",
+                )
+                # Skipped stages are recorded in memory only, not persisted.
+                continue
+            # Build stage input
+            engine_input = EngineInput(
+                run_id=run_id,
+                input_spec=input_spec,
+                records=current_records,
+                config=config,
+                run_dir=run_dir,
+                previous_artifacts=all_artifacts,
+            )
+            try:
+                output = run_fn(engine_input)
+                if not isinstance(output, EngineOutput):
+                    # Wrap legacy return values
+                    output = EngineOutput(
+                        stage=stage_name,
+                        status=StageStatus.SUCCESS,
+                        records=current_records,
+                        summary=str(output) if output else "completed",
+                    )
+                # Update records if the stage produced new ones
+                if output.records:
+                    current_records = output.records
+                stage_outputs[stage_name] = output
+                all_artifacts.extend(output.artifacts)
+                logger.info("  → %s: %s", stage_name, output.status.value)
+                if output.summary:
+                    logger.info("    %s", output.summary)
+            except Exception as exc:
+                logger.error("  ✗ %s failed: %s", stage_name, exc, exc_info=True)
+                stage_outputs[stage_name] = EngineOutput(
+                    stage=stage_name,
+                    status=StageStatus.FAILED,
+                    error=str(exc),
+                )
+            # Persist stage result
+            storage.insert_stage_result(stage_outputs[stage_name])
+        # 10. Store final records (may have been enriched by stages)
+        # NOTE(review): records were already inserted in step 6; confirm the
+        # storage backend replaces rather than duplicates on a second insert.
+        storage.insert_records(current_records)
+        # 11. Generate report
+        logger.info("── Generating report ──")
+        report_artifacts = generate_report(
+            run_id=run_id,
+            run_dir=run_dir,
+            input_spec=input_spec,
+            records=current_records,
+            stage_outputs=stage_outputs,
+        )
+        for a in report_artifacts:
+            storage.insert_artifact(a)
+        all_artifacts.extend(report_artifacts)
+        # 12. Write run manifest
+        manifest = {
+            "run_id": run_id,
+            "run_dir": str(run_dir),
+            "input_path": str(input_path),
+            "total_records": len(current_records),
+            "stages": {
+                name: {"status": out.status.value, "summary": out.summary}
+                for name, out in stage_outputs.items()
+            },
+            "artifacts": [
+                {"name": a.name, "path": str(a.path)}
+                for a in all_artifacts
+            ],
+            "completed_at": datetime.now(timezone.utc).isoformat(),
+        }
+        manifest_path = run_dir / "manifest.json"
+        manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+        logger.info("Manifest: %s", manifest_path)
+    finally:
+        # 13. Close storage on every exit path, including failures above
+        storage.close()
+    # 14. Summary
+    logger.info("═══ Pipeline complete ═══")
+    logger.info("  Run ID:    %s", run_id)
+    logger.info("  Records:   %d", len(current_records))
+    logger.info("  DB:        %s", db_path)
+    logger.info("  Report:    %s", run_dir / "report" / "index.html")
+    logger.info("  Exports:   %s", run_dir / "exports")
+    ctx.stage_results = stage_outputs
+    return ctx
+# ---------------------------------------------------------------------------
+# CLI entrypoint
+# ---------------------------------------------------------------------------
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="MOD-OSINT Pipeline Orchestrator",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python -m engine.pipeline_orchestrator --input samples/demo_ingest/
+  python -m engine.pipeline_orchestrator --input data/case_001.csv
+        """,
+    )
+    parser.add_argument(
+        "--input", "-i",
+        required=True,
+        help="Path to input directory or single file",
+    )
+    parser.add_argument(
+        "--config", "-c",
+        default=None,
+        help="Path to JSON config overrides",
+    )
+    parser.add_argument(
+        "--runs-dir",
+        default="runs",
+        help="Base directory for run outputs (default: runs/)",
+    )
+    args = parser.parse_args()
+    config = {}
+    if args.config:
+        config = json.loads(Path(args.config).read_text())
+    run_pipeline(
+        input_path=args.input,
+        config=config,
+        runs_base=Path(args.runs_dir),
+    )
+if __name__ == "__main__":
+    main()

engine/registry.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+Module Registry — discovery, registration, and dependency ordering.
+The engine discovers pipeline modules from ``modules/`` (lowercase only).
+Each module that wants to participate in the pipeline must either:
+1. Be listed in ``PIPELINE_STAGES`` (the built-in ordered list), or
+2. Register itself via the ``@pipeline_stage`` decorator.
+The registry enforces that every registered module exposes a
+``run(input: EngineInput) -> EngineOutput`` callable.
+"""
+from __future__ import annotations
+import importlib
+import logging
+from collections import OrderedDict
+from typing import Callable, Dict, List, Optional, Protocol
+from engine.io_contract import EngineInput, EngineOutput
+logger = logging.getLogger("engine.registry")
+# ---------------------------------------------------------------------------
+# Protocol that every pipeline module must satisfy
+# ---------------------------------------------------------------------------
+class PipelineModule(Protocol):
+    """Structural type for a pipeline module."""
+    def run(self, engine_input: EngineInput) -> EngineOutput: ...
+# ---------------------------------------------------------------------------
+# Built-in pipeline stage ordering
+# ---------------------------------------------------------------------------
+PIPELINE_STAGES: List[Dict[str, str]] = [
+    {"name": "ingestion",      "module_path": "modules.ingestion.ingest_data"},
+    {"name": "preprocessing",  "module_path": "modules.preprocessing.preprocess_data"},
+    {"name": "analysis",       "module_path": "modules.ml_analysis.ml_analysis"},
+    {"name": "correlation",    "module_path": "modules.correlation.correlate"},
+    {"name": "export",         "module_path": "modules.export.export_results"},
+]
+# ---------------------------------------------------------------------------
+# Registry singleton
+# ---------------------------------------------------------------------------
+# Module-level mapping of stage name -> run callable.  Insertion order is
+# preserved and is the order returned by get_ordered_stages().
+_registry: OrderedDict[str, Callable[[EngineInput], EngineOutput]] = OrderedDict()
+def register(name: str, run_fn: Callable[[EngineInput], EngineOutput]) -> None:
+    """Register a module's run function under *name*."""
+    if name in _registry:
+        logger.warning("Overwriting existing registration for stage '%s'", name)
+    _registry[name] = run_fn
+    logger.info("Registered pipeline stage: %s", name)
+def get_stage(name: str) -> Optional[Callable[[EngineInput], EngineOutput]]:
+    """Return the run function for *name*, or ``None``."""
+    return _registry.get(name)
+def get_ordered_stages() -> List[str]:
+    """Return stage names in pipeline execution order."""
+    return list(_registry.keys())
+def clear() -> None:
+    """Clear all registrations (useful for testing)."""
+    _registry.clear()
+# ---------------------------------------------------------------------------
+# Decorator for ad-hoc registration
+# ---------------------------------------------------------------------------
+def pipeline_stage(name: str):
+    """
+    Decorator to register a function as a pipeline stage::
+        @pipeline_stage("my_stage")
+        def run(engine_input: EngineInput) -> EngineOutput:
+            ...
+    """
+    def decorator(fn: Callable[[EngineInput], EngineOutput]):
+        register(name, fn)
+        return fn
+    return decorator
+# ---------------------------------------------------------------------------
+# Auto-discovery from PIPELINE_STAGES
+# ---------------------------------------------------------------------------
+def discover_and_register() -> List[str]:
+    """
+    Import each module listed in ``PIPELINE_STAGES`` and register its
+    ``run`` function.  Returns the list of successfully registered stage names.
+    Only discovers from ``modules/`` (lowercase).  The uppercase ``Modules/``
+    directory is explicitly excluded.
+    """
+    registered: List[str] = []
+    for stage_def in PIPELINE_STAGES:
+        name = stage_def["name"]
+        module_path = stage_def["module_path"]
+        try:
+            mod = importlib.import_module(module_path)
+            run_fn = getattr(mod, "run", None)
+            if run_fn is None:
+                logger.error(
+                    "Module '%s' (%s) has no run() function — skipping",
+                    name, module_path,
+                )
+                continue
+            register(name, run_fn)
+            registered.append(name)
+        except ImportError as exc:
+            logger.error(
+                "Failed to import module '%s' (%s): %s",
+                name, module_path, exc,
+            )
+        except Exception as exc:
+            logger.error(
+                "Unexpected error loading '%s' (%s): %s",
+                name, module_path, exc,
+            )
+    return registered

engine/reporting.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""
+Reporting — HTML report generation + JSONL/CSV exports.
+Produces:
+    runs/<run_id>/report/index.html   – human-browsable report
+    runs/<run_id>/exports/normalized.jsonl
+    runs/<run_id>/exports/records.csv
+"""
+from __future__ import annotations
+import csv
+import json
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List
+from engine.io_contract import (
+    Artifact,
+    EngineOutput,
+    InputSpec,
+    NormalizedRecord,
+)
+logger = logging.getLogger("engine.reporting")
+# ---------------------------------------------------------------------------
+# Jinja2 setup (lazy import so the module can be imported without jinja2)
+# ---------------------------------------------------------------------------
+_TEMPLATE_DIR = Path(__file__).parent / "templates"
+def _render_html(template_name: str, context: Dict[str, Any]) -> str:
+    """Render a Jinja2 template from the engine/templates/ directory."""
+    try:
+        from jinja2 import Environment, FileSystemLoader
+    except ImportError:
+        logger.warning("jinja2 not installed — HTML report will be a plain summary")
+        return _fallback_html(context)
+    env = Environment(
+        loader=FileSystemLoader(str(_TEMPLATE_DIR)),
+        autoescape=True,
+    )
+    template = env.get_template(template_name)
+    return template.render(**context)
+def _fallback_html(context: Dict[str, Any]) -> str:
+    """Minimal HTML when Jinja2 is unavailable."""
+    return (
+        f"<html><body><h1>MOD-OSINT Report — {context.get('run_id', '?')}</h1>"
+        f"<p>Records: {context.get('total_records', 0)}</p>"
+        f"<p>Generated: {context.get('generated_at', '')}</p>"
+        f"<p><em>Install jinja2 for the full HTML report.</em></p>"
+        f"</body></html>"
+    )
+# ---------------------------------------------------------------------------
+# Export helpers
+# ---------------------------------------------------------------------------
+def export_jsonl(records: List[NormalizedRecord], out_path: Path) -> Path:
+    """Write records as newline-delimited JSON."""
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as f:
+        for r in records:
+            f.write(r.model_dump_json() + "\n")
+    logger.info("Exported %d records to %s", len(records), out_path)
+    return out_path
+def export_csv(records: List[NormalizedRecord], out_path: Path) -> Path:
+    """Write records as CSV."""
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if not records:
+        out_path.write_text("")
+        return out_path
+    fieldnames = list(records[0].model_dump().keys())
+    with open(out_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        for r in records:
+            row = r.model_dump()
+            # Serialize complex fields
+            for k, v in row.items():
+                if isinstance(v, (dict, list)):
+                    row[k] = json.dumps(v, ensure_ascii=False, default=str)
+                elif isinstance(v, Path):
+                    row[k] = str(v)
+                elif v is None:
+                    row[k] = ""
+            writer.writerow(row)
+    logger.info("Exported %d records to %s", len(records), out_path)
+    return out_path
+# ---------------------------------------------------------------------------
+# Main report generation
+# ---------------------------------------------------------------------------
+def generate_report(
+    run_id: str,
+    run_dir: Path,
+    input_spec: InputSpec,
+    records: List[NormalizedRecord],
+    stage_outputs: Dict[str, EngineOutput],
+) -> List[Artifact]:
+    """
+    Generate the full report suite:
+        - HTML report at ``run_dir/report/index.html``
+        - JSONL export at ``run_dir/exports/normalized.jsonl``
+        - CSV export at ``run_dir/exports/records.csv``
+    Returns a list of ``Artifact`` objects.
+    """
+    report_dir = run_dir / "report"
+    exports_dir = run_dir / "exports"
+    report_dir.mkdir(parents=True, exist_ok=True)
+    exports_dir.mkdir(parents=True, exist_ok=True)
+    artifacts: List[Artifact] = []
+    # -- JSONL export --------------------------------------------------------
+    jsonl_path = export_jsonl(records, exports_dir / "normalized.jsonl")
+    artifacts.append(Artifact(
+        name="normalized.jsonl",
+        path=jsonl_path,
+        mime_type="application/jsonl",
+        description="All normalized records in JSONL format",
+    ))
+    # -- CSV export ----------------------------------------------------------
+    csv_path = export_csv(records, exports_dir / "records.csv")
+    artifacts.append(Artifact(
+        name="records.csv",
+        path=csv_path,
+        mime_type="text/csv",
+        description="All normalized records in CSV format",
+    ))
+    # -- HTML report ---------------------------------------------------------
+    preview_limit = 50
+    stages_data = []
+    for name, out in stage_outputs.items():
+        stages_data.append({
+            "stage": name,
+            "status": out.status.value,
+            "summary": out.summary,
+            "error": out.error,
+        })
+    input_files_data = []
+    for f in input_spec.files:
+        input_files_data.append({
+            "name": f.path.name,
+            "file_type": f.file_type.value,
+            "size_bytes": f.size_bytes,
+            "sha256": f.sha256,
+        })
+    records_preview = []
+    for r in records[:preview_limit]:
+        d = r.model_dump()
+        # Convert Path/enum to string for template
+        d["source_type"] = d.get("source_type", "")
+        if hasattr(d["source_type"], "value"):
+            d["source_type"] = d["source_type"].value
+        records_preview.append(d)
+    # Build relative paths for download links
+    artifacts_data = []
+    for a in artifacts:
+        try:
+            rel = a.path.relative_to(report_dir)
+        except ValueError:
+            rel = Path("..") / "exports" / a.path.name
+        artifacts_data.append({"name": a.name, "rel_path": str(rel)})
+    context = {
+        "run_id": run_id,
+        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
+        "total_records": len(records),
+        "input_file_count": len(input_spec.files),
+        "stages": stages_data,
+        "input_files": input_files_data,
+        "records_preview": records_preview,
+        "preview_limit": preview_limit,
+        "artifacts": artifacts_data,
+    }
+    html_content = _render_html("report.html", context)
+    html_path = report_dir / "index.html"
+    html_path.write_text(html_content, encoding="utf-8")
+    logger.info("HTML report written to %s", html_path)
+    artifacts.append(Artifact(
+        name="index.html",
+        path=html_path,
+        mime_type="text/html",
+        description="Human-browsable pipeline report",
+    ))
+    return artifacts

engine/storage.py ADDED Viewed

	@@ -0,0 +1,189 @@

+"""
+Storage — SQL persistence for pipeline runs.
+Uses SQLite by default (``runs/<run_id>/db.sqlite``).
+Postgres via a ``DATABASE_URL`` env var (e.g. ``postgresql://user:pass@host/db``) is planned but not implemented yet.
+Tables:
+    normalized_records – all NormalizedRecord rows
+    stage_results      – per-stage summary + status
+    artifacts          – artifact metadata
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import sqlite3
+from pathlib import Path
+from typing import List, Optional
+from engine.io_contract import Artifact, EngineOutput, NormalizedRecord
+logger = logging.getLogger("engine.storage")
+# ---------------------------------------------------------------------------
+# Schema DDL (SQLite-compatible, works with Postgres too)
+# ---------------------------------------------------------------------------
+# Executed as one script on every connect(); IF NOT EXISTS makes re-running it
+# on an existing database safe.  Each table's primary key (row_id / stage /
+# name) is the key that the INSERT OR REPLACE statements below upsert on.
+_DDL = """
+CREATE TABLE IF NOT EXISTS normalized_records (
+    row_id        TEXT PRIMARY KEY,
+    source_file   TEXT,
+    source_type   TEXT,
+    timestamp     TEXT,
+    entity_name   TEXT,
+    entity_phone  TEXT,
+    entity_email  TEXT,
+    entity_ip     TEXT,
+    entity_domain TEXT,
+    entity_hash   TEXT,
+    raw_text      TEXT,
+    extra         TEXT
+);
+CREATE TABLE IF NOT EXISTS stage_results (
+    stage   TEXT PRIMARY KEY,
+    status  TEXT,
+    summary TEXT,
+    error   TEXT,
+    metadata TEXT
+);
+CREATE TABLE IF NOT EXISTS artifacts (
+    name        TEXT PRIMARY KEY,
+    path        TEXT,
+    mime_type   TEXT,
+    description TEXT
+);
+"""
+# ---------------------------------------------------------------------------
+# Storage backend
+# ---------------------------------------------------------------------------
+class StorageBackend:
+    """
+    Thin wrapper around a SQLite (or Postgres) connection.
+    For this iteration we use raw ``sqlite3``.  A future iteration can
+    swap in SQLAlchemy / SQLModel for Postgres parity.
+    """
+    def __init__(self, db_path: Path):
+        self.db_path = db_path
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        self._conn: Optional[sqlite3.Connection] = None
+    # -- lifecycle -----------------------------------------------------------
+    def connect(self) -> None:
+        logger.info("Connecting to SQLite: %s", self.db_path)
+        self._conn = sqlite3.connect(str(self.db_path))
+        self._conn.executescript(_DDL)
+        self._conn.commit()
+    def close(self) -> None:
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+    @property
+    def conn(self) -> sqlite3.Connection:
+        if self._conn is None:
+            self.connect()
+        assert self._conn is not None
+        return self._conn
+    # -- writes --------------------------------------------------------------
+    def insert_records(self, records: List[NormalizedRecord]) -> int:
+        """Insert normalized records.  Returns count inserted."""
+        if not records:
+            return 0
+        sql = """
+            INSERT OR REPLACE INTO normalized_records
+            (row_id, source_file, source_type, timestamp,
+             entity_name, entity_phone, entity_email,
+             entity_ip, entity_domain, entity_hash,
+             raw_text, extra)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """
+        rows = [
+            (
+                r.row_id,
+                r.source_file,
+                r.source_type.value if r.source_type else "",
+                r.timestamp.isoformat() if r.timestamp else None,
+                r.entity_name,
+                r.entity_phone,
+                r.entity_email,
+                r.entity_ip,
+                r.entity_domain,
+                r.entity_hash,
+                r.raw_text,
+                json.dumps(r.extra, ensure_ascii=False, default=str),
+            )
+            for r in records
+        ]
+        self.conn.executemany(sql, rows)
+        self.conn.commit()
+        logger.info("Inserted %d normalized records", len(rows))
+        return len(rows)
+    def insert_stage_result(self, output: EngineOutput) -> None:
+        """Upsert a stage result row."""
+        sql = """
+            INSERT OR REPLACE INTO stage_results
+            (stage, status, summary, error, metadata)
+            VALUES (?, ?, ?, ?, ?)
+        """
+        self.conn.execute(sql, (
+            output.stage,
+            output.status.value,
+            output.summary,
+            output.error,
+            json.dumps(output.metadata, ensure_ascii=False, default=str),
+        ))
+        self.conn.commit()
+    def insert_artifact(self, artifact: Artifact) -> None:
+        """Upsert an artifact metadata row."""
+        sql = """
+            INSERT OR REPLACE INTO artifacts
+            (name, path, mime_type, description)
+            VALUES (?, ?, ?, ?)
+        """
+        self.conn.execute(sql, (
+            artifact.name,
+            str(artifact.path),
+            artifact.mime_type,
+            artifact.description,
+        ))
+        self.conn.commit()
+    # -- reads ---------------------------------------------------------------
+    def count_records(self) -> int:
+        cur = self.conn.execute("SELECT COUNT(*) FROM normalized_records")
+        return cur.fetchone()[0]
+    def fetch_all_records(self) -> List[dict]:
+        """Return all normalized records as dicts."""
+        cur = self.conn.execute("SELECT * FROM normalized_records")
+        cols = [d[0] for d in cur.description]
+        return [dict(zip(cols, row)) for row in cur.fetchall()]
+    def fetch_stage_results(self) -> List[dict]:
+        cur = self.conn.execute("SELECT * FROM stage_results")
+        cols = [d[0] for d in cur.description]
+        return [dict(zip(cols, row)) for row in cur.fetchall()]
+def create_storage(db_path: Path) -> StorageBackend:
+    """Factory: create and connect a StorageBackend."""
+    backend = StorageBackend(db_path)
+    backend.connect()
+    return backend

engine/templates/report.html ADDED Viewed

	@@ -0,0 +1,171 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>MOD-OSINT Report — {{ run_id }}</title>
+    <style>
+        :root {
+            --bg: #0d1117;
+            --surface: #161b22;
+            --border: #30363d;
+            --text: #c9d1d9;
+            --accent: #58a6ff;
+            --success: #3fb950;
+            --error: #f85149;
+            --warn: #d29922;
+        }
+        * { box-sizing: border-box; margin: 0; padding: 0; }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
+            background: var(--bg);
+            color: var(--text);
+            padding: 2rem;
+            line-height: 1.6;
+        }
+        h1, h2, h3 { color: var(--accent); margin-bottom: 0.5rem; }
+        h1 { font-size: 1.8rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
+        h2 { font-size: 1.3rem; margin-top: 2rem; }
+        .meta { color: #8b949e; font-size: 0.9rem; margin-bottom: 1.5rem; }
+        .card {
+            background: var(--surface);
+            border: 1px solid var(--border);
+            border-radius: 6px;
+            padding: 1rem 1.2rem;
+            margin-bottom: 1rem;
+        }
+        .badge {
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 0.8rem;
+            font-weight: 600;
+        }
+        .badge-success { background: var(--success); color: #000; }
+        .badge-failed  { background: var(--error); color: #fff; }
+        .badge-skipped { background: var(--warn); color: #000; }
+        table {
+            width: 100%;
+            border-collapse: collapse;
+            margin-top: 0.5rem;
+            font-size: 0.85rem;
+        }
+        th, td {
+            text-align: left;
+            padding: 6px 10px;
+            border-bottom: 1px solid var(--border);
+        }
+        th { color: var(--accent); font-weight: 600; }
+        tr:hover { background: rgba(88,166,255,0.05); }
+        .truncate { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+        a { color: var(--accent); text-decoration: none; }
+        a:hover { text-decoration: underline; }
+        .downloads { display: flex; gap: 1rem; flex-wrap: wrap; margin-top: 1rem; }
+        .dl-btn {
+            display: inline-block;
+            padding: 8px 16px;
+            background: var(--accent);
+            color: #000;
+            border-radius: 6px;
+            font-weight: 600;
+            font-size: 0.85rem;
+        }
+        .dl-btn:hover { opacity: 0.85; text-decoration: none; }
+    </style>
+</head>
+<body>
+    <h1>🕵️ MOD-OSINT Pipeline Report</h1>
+    <p class="meta">
+        Run ID: <strong>{{ run_id }}</strong> &nbsp;|&nbsp;
+        Generated: <strong>{{ generated_at }}</strong> &nbsp;|&nbsp;
+        Records: <strong>{{ total_records }}</strong> &nbsp;|&nbsp;
+        Input files: <strong>{{ input_file_count }}</strong>
+    </p>
+    <!-- Stage Results -->
+    <h2>Pipeline Stages</h2>
+    {% for stage in stages %}
+    <div class="card">
+        <strong>{{ stage.stage }}</strong>
+        {% if stage.status == "success" %}
+            <span class="badge badge-success">✓ success</span>
+        {% elif stage.status == "failed" %}
+            <span class="badge badge-failed">✗ failed</span>
+        {% else %}
+            <span class="badge badge-skipped">⊘ {{ stage.status }}</span>
+        {% endif %}
+        {% if stage.summary %}
+        <p style="margin-top:0.4rem; font-size:0.9rem;">{{ stage.summary }}</p>
+        {% endif %}
+        {% if stage.error %}
+        <p style="margin-top:0.4rem; color:var(--error); font-size:0.85rem;">Error: {{ stage.error }}</p>
+        {% endif %}
+    </div>
+    {% endfor %}
+    <!-- Input Files -->
+    <h2>Input Files</h2>
+    <div class="card">
+        <table>
+            <thead><tr><th>File</th><th>Type</th><th>Size</th><th>SHA-256</th></tr></thead>
+            <tbody>
+            {% for f in input_files %}
+                <tr>
+                    <td>{{ f.name }}</td>
+                    <td>{{ f.file_type }}</td>
+                    <td>{{ f.size_bytes }} B</td>
+                    <td class="truncate" title="{{ f.sha256 }}">{{ f.sha256[:16] }}…</td>
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <!-- Records Preview -->
+    <h2>Normalized Records (first {{ preview_limit }})</h2>
+    <div class="card" style="overflow-x:auto;">
+        <table>
+            <thead>
+                <tr>
+                    <th>row_id</th>
+                    <th>source</th>
+                    <th>type</th>
+                    <th>name</th>
+                    <th>phone</th>
+                    <th>email</th>
+                    <th>ip</th>
+                    <th>domain</th>
+                    <th>raw_text</th>
+                </tr>
+            </thead>
+            <tbody>
+            {% for r in records_preview %}
+                <tr>
+                    <td><code>{{ r.row_id }}</code></td>
+                    <td>{{ r.source_file }}</td>
+                    <td>{{ r.source_type }}</td>
+                    <td>{{ r.entity_name or '' }}</td>
+                    <td>{{ r.entity_phone or '' }}</td>
+                    <td>{{ r.entity_email or '' }}</td>
+                    <td>{{ r.entity_ip or '' }}</td>
+                    <td>{{ r.entity_domain or '' }}</td>
+                    <td class="truncate" title="{{ r.raw_text }}">{{ r.raw_text[:80] }}</td>
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <!-- Downloads -->
+    <h2>Downloadable Artifacts</h2>
+    <div class="downloads">
+        {% for a in artifacts %}
+        <a class="dl-btn" href="{{ a.rel_path }}" download>⬇ {{ a.name }}</a>
+        {% endfor %}
+    </div>
+    <p class="meta" style="margin-top:3rem;">
+        Generated by <strong>MOD-OSINT Engine v0.1.0</strong>
+    </p>
+</body>
+</html>

gui/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""
+MOD-OSINT GUI package.
+Streamlit entrypoint: gui/streamlit_app.py
+"""

gui/streamlit_app.py ADDED Viewed

	@@ -0,0 +1,332 @@

+"""
+MOD-OSINT Streamlit GUI Wizard
+Wired to engine.pipeline_orchestrator.run_pipeline()
+Stages:
+  A — Upload / Input selection
+  B — Settings
+  C — Run pipeline
+  D — Browse / Export results
+Import safety:
+  This module avoids importing Streamlit at module load time so CI/tests can
+  import it without ScriptRunContext warnings.
+"""
+from __future__ import annotations
+import sqlite3
+import tempfile
+from pathlib import Path
+import pandas as pd
+_DEMO_DIR = Path("samples/demo_ingest")
+def _load_yaml_defaults(path: Path) -> dict:
+    try:
+        import yaml  # optional; provided by requirements-hf.txt
+        return yaml.safe_load(path.read_text()) or {}
+    except Exception:
+        return {}
+def _write_uploads(uploads) -> Path:
+    """Save uploaded files into a temp dir and return the dir path."""
+    tmp = Path(tempfile.mkdtemp(prefix="modosint_"))
+    updir = tmp / "uploads"
+    updir.mkdir(parents=True, exist_ok=True)
+    for file_obj in uploads:
+        (updir / file_obj.name).write_bytes(file_obj.getbuffer())
+    return updir
+def _resolve_input(session_state) -> Path | None:
+    """Determine input from session state (uploads > local path > demo)."""
+    uploads = session_state.get("_uploads")
+    if uploads:
+        return _write_uploads(uploads)
+    local_path = session_state.get("_local_path", "").strip()
+    if local_path:
+        path_obj = Path(local_path).expanduser()
+        if path_obj.exists():
+            return path_obj
+    if session_state.get("_use_demo") and _DEMO_DIR.exists():
+        return _DEMO_DIR
+    return None
+def main() -> None:
+    """Entrypoint for `streamlit run gui/streamlit_app.py`."""
+    import streamlit as st
+    import streamlit.components.v1 as components
+    from engine.pipeline_orchestrator import run_pipeline
+    from gui.terminal_panel import render_terminal
+    st.set_page_config(
+        page_title="MOD-OSINT",
+        page_icon="🧠",
+        layout="wide",
+        initial_sidebar_state="expanded",
+    )
+    st.title("🧠 MOD-OSINT")
+    st.caption("GUI wizard -> `engine.pipeline_orchestrator.run_pipeline()`")
+    if "effective_config" not in st.session_state:
+        st.session_state["effective_config"] = {}
+    if "last_run_id" not in st.session_state:
+        st.session_state["last_run_id"] = None
+    if "last_run_dir" not in st.session_state:
+        st.session_state["last_run_dir"] = None
+    with st.sidebar:
+        render_terminal({"effective_config": st.session_state["effective_config"]})
+    tab_upload, tab_settings, tab_run, tab_browse = st.tabs(
+        ["📂 Upload", "⚙️ Settings", "▶️ Run", "📊 Browse"]
+    )
+    with tab_upload:
+        st.subheader("A) Upload or select input")
+        uploads = st.file_uploader(
+            "Upload files (CSV, JSON, TXT, HTML, LOG)",
+            accept_multiple_files=True,
+            key="_uploads",
+        )
+        if uploads:
+            st.success(f"Queued {len(uploads)} file(s): {[u.name for u in uploads]}")
+        st.divider()
+        local_path = st.text_input(
+            "Or enter a local directory / file path",
+            value="",
+            key="_local_path",
+            placeholder="/path/to/data/",
+        )
+        st.divider()
+        st.checkbox(
+            f"Use built-in demo dataset (`{_DEMO_DIR}`)",
+            value=not bool(uploads) and not bool(local_path),
+            key="_use_demo",
+            disabled=not _DEMO_DIR.exists(),
+            help="Runs the pipeline against samples/demo_ingest/ for quick smoke testing.",
+        )
+        if _DEMO_DIR.exists():
+            demo_files = sorted(_DEMO_DIR.iterdir())
+            st.caption(f"Demo files: {[f.name for f in demo_files if f.is_file()]}")
+        else:
+            st.caption("`samples/demo_ingest/` not found in working directory.")
+    with tab_settings:
+        st.subheader("B) Pipeline settings")
+        cfg_path = Path("pipeline_config.yaml")
+        defaults = _load_yaml_defaults(cfg_path) if cfg_path.exists() else {}
+        col_left, col_right = st.columns(2)
+        with col_left:
+            offline_mode = st.toggle(
+                "offline_mode",
+                value=True,
+                help="Disable all outbound network calls.",
+            )
+            enable_ml = st.toggle(
+                "enable_ml_analysis",
+                value=False,
+                help="Enable ML/NLP stage (requires torch; off by default).",
+            )
+        with col_right:
+            correlation_mode = st.selectbox(
+                "correlation_mode",
+                ["basic", "in-memory"],
+                index=0,
+                help="basic = simple entity matching; in-memory = graph in RAM.",
+            )
+        effective_config: dict = defaults.copy()
+        effective_config.setdefault("runtime", {})
+        effective_config["runtime"].update(
+            {
+                "offline_mode": offline_mode,
+                "enable_ml_analysis": enable_ml,
+                "correlation_mode": correlation_mode,
+            }
+        )
+        st.session_state["effective_config"] = effective_config
+        st.markdown("**Effective config (passed to engine):**")
+        st.json(effective_config)
+    with tab_run:
+        st.subheader("C) Run pipeline")
+        st.caption("Outputs are written to `runs/<run_id>/`.")
+        input_path = _resolve_input(st.session_state)
+        if input_path:
+            st.info(f"Input resolved -> `{input_path}`")
+        else:
+            st.warning("No input selected. Go to Upload tab or enable demo dataset.")
+        run_btn = st.button("🚀 Run pipeline now", type="primary", disabled=input_path is None)
+        if run_btn and input_path:
+            progress = st.progress(0, text="Starting...")
+            log_area = st.empty()
+            log_lines: list[str] = []
+            def _log(message: str) -> None:
+                log_lines.append(message)
+                log_area.code("\n".join(log_lines[-40:]), language="bash")
+            _log(f"Input: {input_path}")
+            _log("Calling engine.pipeline_orchestrator.run_pipeline()...")
+            progress.progress(10, text="Normalizing files...")
+            try:
+                ctx = run_pipeline(
+                    input_path=input_path,
+                    config=st.session_state["effective_config"],
+                )
+                st.session_state["last_run_id"] = ctx.run_id
+                st.session_state["last_run_dir"] = str(ctx.run_dir)
+                progress.progress(90, text="Generating report...")
+                _log(f"Run ID:  {ctx.run_id}")
+                _log(f"Run dir: {ctx.run_dir}")
+                if ctx.stage_results:
+                    for stage_name, stage_out in ctx.stage_results.items():
+                        _log(f"  [{stage_out.status.value.upper():8s}] {stage_name}")
+                progress.progress(100, text="Done")
+                st.success(f"Pipeline complete - run `{ctx.run_id}`")
+                st.code(str(ctx.run_dir))
+                st.info("Switch to Browse tab to explore outputs.")
+            except Exception as exc:
+                progress.empty()
+                st.error(f"Pipeline failed: {exc}")
+                _log(f"ERROR: {exc}")
+    with tab_browse:
+        st.subheader("D) Browse results")
+        run_dir_str = st.session_state.get("last_run_dir")
+        if not run_dir_str:
+            st.info("Run the pipeline first (Stage C).")
+            return
+        run_dir = Path(run_dir_str)
+        report_html = run_dir / "report" / "index.html"
+        db_path = run_dir / "db.sqlite"
+        exports_dir = run_dir / "exports"
+        manifest_path = run_dir / "manifest.json"
+        col1, col2, col3, col4 = st.columns(4)
+        col1.metric("Run ID", st.session_state.get("last_run_id", "-"))
+        col2.metric("Report", "yes" if report_html.exists() else "no")
+        col3.metric("DB", "yes" if db_path.exists() else "no")
+        col4.metric("Exports", str(len(list(exports_dir.rglob("*"))) if exports_dir.exists() else 0))
+        if manifest_path.exists():
+            with st.expander("Run manifest"):
+                import json
+                st.json(json.loads(manifest_path.read_text()))
+        st.divider()
+        st.markdown("### HTML Report")
+        if report_html.exists():
+            st.markdown(f"`{report_html}`")
+            try:
+                components.html(report_html.read_text(errors="replace"), height=700, scrolling=True)
+            except Exception as exc:
+                st.warning(f"Inline render failed ({exc}). Open the path above in a browser.")
+            with open(report_html, "rb") as file_handle:
+                st.download_button(
+                    "Download report/index.html",
+                    data=file_handle,
+                    file_name="index.html",
+                    mime="text/html",
+                )
+        else:
+            st.info("No report/index.html yet.")
+        st.divider()
+        st.markdown("### Exports")
+        if exports_dir.exists():
+            export_files = sorted([path for path in exports_dir.rglob("*") if path.is_file()])
+            if export_files:
+                for export_file in export_files:
+                    rel = export_file.relative_to(run_dir).as_posix()
+                    col_path, col_download = st.columns([3, 1])
+                    col_path.write(f"`{rel}`")
+                    with open(export_file, "rb") as file_handle:
+                        col_download.download_button(
+                            "Download",
+                            data=file_handle,
+                            file_name=export_file.name,
+                            key=f"dl_{rel}",
+                        )
+            else:
+                st.info("Exports directory is empty.")
+        else:
+            st.info("No exports/ directory found.")
+        jsonl_path = run_dir / "normalized.jsonl"
+        if jsonl_path.exists():
+            with open(jsonl_path, "rb") as file_handle:
+                st.download_button(
+                    "Download normalized.jsonl",
+                    data=file_handle,
+                    file_name="normalized.jsonl",
+                    mime="application/x-ndjson",
+                )
+        st.divider()
+        st.markdown("### SQLite DB Preview")
+        if not db_path.exists():
+            st.info("No db.sqlite found.")
+            return
+        with open(db_path, "rb") as file_handle:
+            st.download_button(
+                "Download db.sqlite",
+                data=file_handle,
+                file_name="db.sqlite",
+                mime="application/x-sqlite3",
+            )
+        try:
+            conn = sqlite3.connect(db_path)
+            tables = pd.read_sql(
+                "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
+                conn,
+            )["name"].tolist()
+            if tables:
+                st.write("Tables:", tables)
+                selected_table = st.selectbox("Preview table", tables, key="db_table_sel")
+                dataframe = pd.read_sql(f"SELECT * FROM [{selected_table}] LIMIT 200;", conn)
+                st.dataframe(dataframe, use_container_width=True)
+            else:
+                st.info("DB exists but contains no tables yet.")
+            conn.close()
+        except Exception as exc:
+            st.warning(f"DB preview failed: {exc}")
+if __name__ == "__main__":
+    main()

gui/terminal_panel.py ADDED Viewed

	@@ -0,0 +1,127 @@

+from __future__ import annotations
+import json
+from pathlib import Path
+ALLOWED = {
+    "help",
+    "show config",
+    "list runs",
+    "open last report",
+    "tail log",
+    "run pipeline",
+}
+def _st():
+    """Import streamlit lazily so module import stays side-effect free."""
+    import streamlit as st
+    return st
+def _runs_dir() -> Path:
+    return Path("runs")
+def _list_runs(limit: int = 25) -> list[Path]:
+    runs_dir = _runs_dir()
+    if not runs_dir.exists():
+        return []
+    runs = [path for path in runs_dir.iterdir() if path.is_dir()]
+    runs.sort(key=lambda path: path.name, reverse=True)
+    return runs[:limit]
+def _tail(path: Path, n: int = 200) -> str:
+    try:
+        lines = path.read_text(errors="replace").splitlines()
+        return "\n".join(lines[-n:])
+    except Exception as exc:
+        return f"[error] {exc}"
+def render_terminal(state: dict) -> None:
+    st = _st()
+    st.subheader("Terminal (safe mock)")
+    st.caption("Allowed: help | show config | list runs | open last report | tail log | run pipeline")
+    out = st.session_state.get("_term_out", "")
+    cmd = st.text_input("Command", key="_term_cmd")
+    col1, col2 = st.columns([1, 1])
+    run = col1.button("Run", use_container_width=True)
+    clear = col2.button("Clear", use_container_width=True)
+    if clear:
+        st.session_state["_term_out"] = ""
+        st.rerun()
+    if run and cmd:
+        cmd_norm = cmd.strip().lower()
+        if cmd_norm not in ALLOWED:
+            out += f"\n$ {cmd}\n[blocked] command not allowed\n"
+        else:
+            out += f"\n$ {cmd}\n"
+            if cmd_norm == "help":
+                out += "help | show config | list runs | open last report | tail log | run pipeline\n"
+            elif cmd_norm == "show config":
+                out += json.dumps(state.get("effective_config", {}), indent=2) + "\n"
+            elif cmd_norm == "list runs":
+                runs = _list_runs()
+                out += "\n".join([path.name for path in runs]) + ("\n" if runs else "[none]\n")
+            elif cmd_norm == "open last report":
+                runs = _list_runs(1)
+                if not runs:
+                    out += "[none]\n"
+                else:
+                    report = runs[0] / "report" / "index.html"
+                    out += f"{report.as_posix()}\n"
+            elif cmd_norm == "tail log":
+                runs = _list_runs(1)
+                if not runs:
+                    out += "[none]\n"
+                else:
+                    log_path = runs[0] / "pipeline.log"
+                    out += _tail(log_path) + "\n"
+            elif cmd_norm == "run pipeline":
+                input_path = None
+                uploads = st.session_state.get("_uploads")
+                if uploads:
+                    import tempfile
+                    tmp = Path(tempfile.mkdtemp(prefix="modosint_term_"))
+                    updir = tmp / "uploads"
+                    updir.mkdir(parents=True, exist_ok=True)
+                    for file_obj in uploads:
+                        (updir / file_obj.name).write_bytes(file_obj.getbuffer())
+                    input_path = updir
+                elif st.session_state.get("_local_path", "").strip():
+                    path_obj = Path(st.session_state["_local_path"].strip()).expanduser()
+                    if path_obj.exists():
+                        input_path = path_obj
+                elif Path("samples/demo_ingest").exists():
+                    input_path = Path("samples/demo_ingest")
+                if input_path is None:
+                    out += "[error] no input available; upload files or enable demo dataset first\n"
+                else:
+                    try:
+                        from engine.pipeline_orchestrator import run_pipeline
+                        ctx = run_pipeline(
+                            input_path=input_path,
+                            config=state.get("effective_config", {}),
+                        )
+                        st.session_state["last_run_id"] = ctx.run_id
+                        st.session_state["last_run_dir"] = str(ctx.run_dir)
+                        out += f"run_id:  {ctx.run_id}\nrun_dir: {ctx.run_dir}\n"
+                        for stage_name, stage_out in ctx.stage_results.items():
+                            out += f"  [{stage_out.status.value.upper():8s}] {stage_name}\n"
+                    except Exception as exc:
+                        out += f"[error] {exc}\n"
+        st.session_state["_term_out"] = out
+        st.rerun()
+    st.text_area("Output", value=st.session_state.get("_term_out", ""), height=260)

modules/README.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# MODULES Directory
+## Purpose:
+This directory houses the functional modules responsible for specific phases of the OSINT pipeline. Each subfolder represents a stage of processing from data ingestion to export.
+## Subdirectories:
+- `ingestion/`: Connects to external data sources (web scrapers, APIs, sensors).
+- `preprocessing/`: Cleans, deduplicates, and normalizes raw data into structured input.
+- `ml_analysis/`: Applies machine learning models (e.g., classification, clustering, NLP).
+- `correlation/`: Cross-references data (e.g., STIX/IOC matching, pattern detection).
+- `export/`: Packages output into files, databases, APIs, or dashboards.
+## Design Philosophy:
+Each module is atomic and reusable, and exchanges data through standardized JSON inputs and outputs. Names follow each module's functional role, and new tools should be added under the appropriate phase.

modules/__init__.py ADDED Viewed

File without changes

modules/correlation/correlate.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""
+Correlation module — entity linking and cross-record correlation.
+Engine contract:
+    run(EngineInput) -> EngineOutput
+Groups records by shared entity keys (email, IP, phone, domain, hash)
+and produces correlation metadata.  No external dependencies (Neo4j
+is optional and handled separately).
+"""
+from __future__ import annotations
+import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+from engine.io_contract import (
+    EngineInput,
+    EngineOutput,
+    NormalizedRecord,
+    StageStatus,
+)
+logger = logging.getLogger("modules.correlation")
+def _build_entity_index(records: List[NormalizedRecord]) -> Dict[str, List[str]]:
+    """
+    Build an inverted index: entity_value → [row_ids].
+    """
+    index: Dict[str, List[str]] = defaultdict(list)
+    for r in records:
+        for field in ("entity_email", "entity_ip", "entity_phone",
+                       "entity_domain", "entity_hash"):
+            val = getattr(r, field, None)
+            if val:
+                key = f"{field}:{val}"
+                index[key].append(r.row_id)
+    return dict(index)
+def _find_clusters(index: Dict[str, List[str]]) -> List[Set[str]]:
+    """
+    Find clusters of row_ids that share at least one entity value.
+    Uses simple union-find.
+    """
+    parent: Dict[str, str] = {}
+    def find(x: str) -> str:
+        while parent.get(x, x) != x:
+            parent[x] = parent.get(parent[x], parent[x])
+            x = parent[x]
+        return x
+    def union(a: str, b: str) -> None:
+        ra, rb = find(a), find(b)
+        if ra != rb:
+            parent[ra] = rb
+    for entity_key, row_ids in index.items():
+        if len(row_ids) > 1:
+            first = row_ids[0]
+            for rid in row_ids[1:]:
+                union(first, rid)
+    clusters: Dict[str, Set[str]] = defaultdict(set)
+    all_ids = set()
+    for row_ids in index.values():
+        all_ids.update(row_ids)
+    for rid in all_ids:
+        root = find(rid)
+        clusters[root].add(rid)
+    # Only return clusters with 2+ members
+    return [c for c in clusters.values() if len(c) > 1]
+def run(engine_input: EngineInput) -> EngineOutput:
+    """
+    Correlate records by shared entity values.
+    """
+    try:
+        records = engine_input.records
+        index = _build_entity_index(records)
+        clusters = _find_clusters(index)
+        # Annotate records with cluster IDs
+        row_to_cluster: Dict[str, int] = {}
+        for i, cluster in enumerate(clusters):
+            for rid in cluster:
+                row_to_cluster[rid] = i
+        annotated: List[NormalizedRecord] = []
+        for r in records:
+            cluster_id = row_to_cluster.get(r.row_id)
+            if cluster_id is not None:
+                extra = dict(r.extra)
+                extra["correlation_cluster"] = cluster_id
+                annotated.append(r.model_copy(update={"extra": extra}))
+            else:
+                annotated.append(r)
+        correlated_count = len(row_to_cluster)
+        return EngineOutput(
+            stage="correlation",
+            status=StageStatus.SUCCESS,
+            records=annotated,
+            summary=f"Found {len(clusters)} clusters linking {correlated_count} records",
+            metadata={
+                "cluster_count": len(clusters),
+                "correlated_records": correlated_count,
+                "entity_index_size": len(index),
+            },
+        )
+    except Exception as exc:
+        logger.error("Correlation failed: %s", exc, exc_info=True)
+        return EngineOutput(
+            stage="correlation",
+            status=StageStatus.FAILED,
+            error=str(exc),
+        )
+if __name__ == "__main__":
+    print("Correlation module — use via engine pipeline")

modules/correlation/correlate_ioc.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import argparse
+import json
+from schemas.osint_module_io import OSINTModuleInput, OSINTModuleOutput
+def run(input_path: str) -> OSINTModuleOutput:
+    """
+    Correlation phase:
+    - Loads multiple module outputs if needed
+    - Cross-references IOCs/IOAs (MISP, Shodan, honeypots)
+    - Clusters related events/patterns
+    """
+    cfg = OSINTModuleInput.parse_file(input_path)
+    summary = "Correlation complete"
+    indicators = {}
+    confidence = 0.0
+    return OSINTModuleOutput(summary=summary, indicators=indicators, confidence=confidence)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the Correlation module")
+    parser.add_argument("input", help="Path to OSINTModuleInput JSON")
+    args = parser.parse_args()
+    out = run(args.input)
+    print(out.json())

modules/export/export_results.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+Export module — writes final artifacts (JSON summary, per-stage outputs).
+Engine contract:
+    run(EngineInput) -> EngineOutput
+The main report generation (HTML, CSV, JSONL) is handled by
+``engine.reporting``.  This module produces a supplementary JSON
+summary artifact in the run directory.
+"""
+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict
+from engine.io_contract import (
+    Artifact,
+    EngineInput,
+    EngineOutput,
+    StageStatus,
+)
+logger = logging.getLogger("modules.export")
+def run(engine_input: EngineInput) -> EngineOutput:
+    """
+    Export a JSON summary of the pipeline run.
+    """
+    try:
+        run_dir = Path(engine_input.run_dir)
+        exports_dir = run_dir / "exports"
+        exports_dir.mkdir(parents=True, exist_ok=True)
+        # Build summary
+        summary_data = {
+            "run_id": engine_input.run_id,
+            "total_records": len(engine_input.records),
+            "input_files": [
+                {
+                    "name": f.path.name,
+                    "type": f.file_type.value,
+                    "size": f.size_bytes,
+                }
+                for f in engine_input.input_spec.files
+            ],
+            "record_sample": [
+                {
+                    "row_id": r.row_id,
+                    "source": r.source_file,
+                    "entity_name": r.entity_name,
+                    "entity_email": r.entity_email,
+                    "entity_ip": r.entity_ip,
+                }
+                for r in engine_input.records[:20]
+            ],
+        }
+        out_path = exports_dir / "summary.json"
+        out_path.write_text(
+            json.dumps(summary_data, indent=2, ensure_ascii=False, default=str),
+            encoding="utf-8",
+        )
+        logger.info("Exported summary to %s", out_path)
+        return EngineOutput(
+            stage="export",
+            status=StageStatus.SUCCESS,
+            records=engine_input.records,  # pass through
+            artifacts=[
+                Artifact(
+                    name="summary.json",
+                    path=out_path,
+                    mime_type="application/json",
+                    description="Pipeline run summary",
+                )
+            ],
+            summary=f"Exported summary.json ({len(engine_input.records)} records)",
+        )
+    except Exception as exc:
+        logger.error("Export failed: %s", exc, exc_info=True)
+        return EngineOutput(
+            stage="export",
+            status=StageStatus.FAILED,
+            error=str(exc),
+        )
+# ---------------------------------------------------------------------------
+# Legacy compatibility
+# ---------------------------------------------------------------------------
+def export(data: Any, outpath: str) -> str:
+    """Legacy wrapper (deprecated). Use ``run()`` instead."""
+    with open(outpath, "w") as f:
+        f.write(str(data))
+    return outpath
+if __name__ == "__main__":
+    print(export({"result": 123}, "output.txt"))

modules/ingestion/__init__.py ADDED Viewed

File without changes

modules/ingestion/gathering/web_scraper.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def scrape(url):
+    # Placeholder: insert scraping logic
+    return {"url": url, "data": "scraped_content"}
+if __name__ == "__main__":
+    print(scrape("https://example.com"))

modules/ingestion/ingest_data.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+Ingestion module — reads input files and produces NormalizedRecords.
+Engine contract:
+    run(EngineInput) -> EngineOutput
+The heavy lifting (file parsing, entity extraction) is delegated to
+``engine.normalize``.  This module acts as the pipeline-stage wrapper.
+"""
+from __future__ import annotations
+import logging
+from typing import Any, Dict
+from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
+from engine.normalize import normalize_files
+logger = logging.getLogger("modules.ingestion")
+def run(engine_input: EngineInput) -> EngineOutput:
+    """
+    Ingest files described by ``engine_input.input_spec`` and return
+    normalized records.
+    """
+    try:
+        records = normalize_files(engine_input.input_spec)
+        return EngineOutput(
+            stage="ingestion",
+            status=StageStatus.SUCCESS,
+            records=records,
+            summary=f"Ingested {len(records)} records from {len(engine_input.input_spec.files)} files",
+        )
+    except Exception as exc:
+        logger.error("Ingestion failed: %s", exc, exc_info=True)
+        return EngineOutput(
+            stage="ingestion",
+            status=StageStatus.FAILED,
+            error=str(exc),
+        )
+# ---------------------------------------------------------------------------
+# Legacy compatibility — keep old function signature working
+# ---------------------------------------------------------------------------
+def ingest(file_path: str) -> Dict[str, Any]:
+    """Legacy wrapper (deprecated). Use ``run()`` instead."""
+    return {"file": file_path, "status": "ingested"}
+if __name__ == "__main__":
+    print(ingest("input.txt"))

modules/ml_analysis/ml_analysis.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+ML Analysis module — entity extraction and text classification.
+Engine contract:
+    run(EngineInput) -> EngineOutput
+This is a lightweight placeholder that performs regex-based entity
+extraction.  When ML dependencies (torch, transformers, spacy) are
+available, it can be extended to use real models.
+"""
+from __future__ import annotations
+import logging
+import re
+from typing import Any, Dict, List, Optional
+from engine.io_contract import (
+    EngineInput,
+    EngineOutput,
+    NormalizedRecord,
+    StageStatus,
+)
+logger = logging.getLogger("modules.ml_analysis")
+# ---------------------------------------------------------------------------
+# Lightweight entity extraction (no ML deps required)
+# ---------------------------------------------------------------------------
+_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
+_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
+_PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
+_DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
+_HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")
+def _enrich_record(record: NormalizedRecord) -> NormalizedRecord:
+    """
+    Enrich a record by extracting entities from raw_text if not already set.
+    """
+    text = record.raw_text
+    updates: Dict[str, Any] = {}
+    if not record.entity_email:
+        m = _EMAIL_RE.search(text)
+        if m:
+            updates["entity_email"] = m.group(0).lower()
+    if not record.entity_ip:
+        m = _IP_RE.search(text)
+        if m:
+            updates["entity_ip"] = m.group(0)
+    if not record.entity_phone:
+        m = _PHONE_RE.search(text)
+        if m:
+            updates["entity_phone"] = m.group(0)
+    if not record.entity_domain:
+        m = _DOMAIN_RE.search(text)
+        if m:
+            updates["entity_domain"] = m.group(0).lower()
+    if not record.entity_hash:
+        m = _HASH_RE.search(text)
+        if m:
+            updates["entity_hash"] = m.group(0).lower()
+    if updates:
+        return record.model_copy(update=updates)
+    return record
+# ---------------------------------------------------------------------------
+# Engine contract
+# ---------------------------------------------------------------------------
+def run(engine_input: EngineInput) -> EngineOutput:
+    """
+    Run ML analysis / entity enrichment on all records.
+    """
+    try:
+        enriched: List[NormalizedRecord] = []
+        enrichment_count = 0
+        for record in engine_input.records:
+            new_record = _enrich_record(record)
+            if new_record is not record:
+                enrichment_count += 1
+            enriched.append(new_record)
+        return EngineOutput(
+            stage="analysis",
+            status=StageStatus.SUCCESS,
+            records=enriched,
+            summary=f"Analyzed {len(enriched)} records, enriched {enrichment_count}",
+            metadata={"enriched_count": enrichment_count},
+        )
+    except Exception as exc:
+        logger.error("ML analysis failed: %s", exc, exc_info=True)
+        return EngineOutput(
+            stage="analysis",
+            status=StageStatus.FAILED,
+            error=str(exc),
+        )
+# ---------------------------------------------------------------------------
+# Legacy compatibility
+# ---------------------------------------------------------------------------
+def analyze(data: Any) -> Dict[str, Any]:
+    """Legacy wrapper (deprecated). Use ``run()`` instead."""
+    return {"input": data, "prediction": "none"}
+if __name__ == "__main__":
+    print(analyze("test"))

modules/preprocessing/preprocess_data.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+Preprocessing module — cleans and normalizes raw text in records.
+Engine contract:
+    run(EngineInput) -> EngineOutput
+Applies basic text cleaning to each record's ``raw_text`` field:
+strip whitespace, normalize unicode, collapse whitespace runs.
+"""
+from __future__ import annotations
+import logging
+import re
+import unicodedata
+from typing import Any, Dict, List
+from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
+logger = logging.getLogger("modules.preprocessing")
+def _clean_text(text: str) -> str:
+    """Basic text normalization."""
+    # Unicode NFKC normalization
+    text = unicodedata.normalize("NFKC", text)
+    # Strip leading/trailing whitespace
+    text = text.strip()
+    # Collapse multiple whitespace to single space
+    text = re.sub(r"\s+", " ", text)
+    return text
+def run(engine_input: EngineInput) -> EngineOutput:
+    """
+    Preprocess all records: clean ``raw_text``, normalize entity fields.
+    """
+    try:
+        cleaned: List[NormalizedRecord] = []
+        for record in engine_input.records:
+            # Create a copy with cleaned text
+            updated = record.model_copy(update={
+                "raw_text": _clean_text(record.raw_text),
+                "entity_name": record.entity_name.strip() if record.entity_name else None,
+                "entity_email": record.entity_email.strip().lower() if record.entity_email else None,
+                "entity_domain": record.entity_domain.strip().lower() if record.entity_domain else None,
+            })
+            cleaned.append(updated)
+        return EngineOutput(
+            stage="preprocessing",
+            status=StageStatus.SUCCESS,
+            records=cleaned,
+            summary=f"Preprocessed {len(cleaned)} records",
+        )
+    except Exception as exc:
+        logger.error("Preprocessing failed: %s", exc, exc_info=True)
+        return EngineOutput(
+            stage="preprocessing",
+            status=StageStatus.FAILED,
+            error=str(exc),
+        )
+# ---------------------------------------------------------------------------
+# Legacy compatibility
+# ---------------------------------------------------------------------------
+def preprocess(text: str) -> str:
+    """Legacy wrapper (deprecated). Use ``run()`` instead."""
+    return text.strip().lower()
+if __name__ == "__main__":
+    print(preprocess("  This is RAW DATA.  "))

pipeline_config.yaml ADDED Viewed

	@@ -0,0 +1,15 @@

+ai:
+  mode: offline
+  provider: local_gguf
+  fallbacks:
+    - local_gguf
+    - torch_tfidf
+  gguf_path: models/gguf/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+  n_ctx: 2048
+  n_threads: 8
+  n_gpu_layers: 0
+  temperature: 0.2
+  top_p: 0.95
+  max_tokens: 256
+  remote:
+    enabled: false

requirements-hf.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# MOD-OSINT — Hugging Face Docker Space runtime requirements
+# Intentionally lean: no torch, no llama-cpp, no heavy ML.
+# ML analysis is toggled OFF by default in the GUI.
+# GUI
+streamlit>=1.35.0
+# Data handling
+pandas>=2.0.0
+pyyaml>=6.0.2
+# Engine core
+pydantic>=2.10.0
+jinja2>=3.1.4
+# Security
+certifi>=2024.12.14

samples/demo_ingest/example.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ name,phone
2	+ John,8105551212

samples/demo_ingest/example.html ADDED Viewed

	@@ -0,0 +1 @@


1	+ <html><body>demo html</body></html>

samples/demo_ingest/example.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"note":"demo"}

samples/demo_ingest/example.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ hello beta

scripts/docker_entrypoint.sh ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env bash
+# MOD-OSINT Docker entrypoint — startup self-check + Streamlit launch
+set -euo pipefail
+PORT="${STREAMLIT_PORT:-7860}"
+HOST="${STREAMLIT_HOST:-0.0.0.0}"
+if ! [[ "$PORT" =~ ^[0-9]+$ ]] || [ "$PORT" -lt 1 ] || [ "$PORT" -gt 65535 ]; then
+  echo "[fatal] Invalid STREAMLIT_PORT: ${PORT}" >&2
+  exit 1
+fi
+required_files=(
+  "gui/streamlit_app.py"
+  "engine/pipeline_orchestrator.py"
+  "requirements-hf.txt"
+)
+for file_path in "${required_files[@]}"; do
+  if [ ! -f "$file_path" ]; then
+    echo "[fatal] Missing required file: $file_path" >&2
+    exit 1
+  fi
+done
+for dir_path in runs logs; do
+  mkdir -p "$dir_path"
+  if [ ! -w "$dir_path" ]; then
+    echo "[fatal] Directory not writable: $dir_path" >&2
+    exit 1
+  fi
+done
+python3 - <<'PY'
+import importlib
+import sys
+print(f"[startup] Python: {sys.version.split()[0]}")
+for mod in ("streamlit", "pandas", "yaml"):
+    importlib.import_module(mod)
+print("[startup] Dependency import check: OK")
+PY
+echo "════════════════════════════════════════════════════════"
+echo "  MOD-OSINT  |  Streamlit GUI"
+echo "  Binding:    ${HOST}:${PORT}"
+echo "  Health URL: http://localhost:${PORT}/_stcore/health"
+echo "════════════════════════════════════════════════════════"
+if [ "${MODOSINT_STARTUP_CHECK_ONLY:-0}" = "1" ]; then
+  echo "[startup] MODOSINT_STARTUP_CHECK_ONLY=1 -> exiting after checks"
+  exit 0
+fi
+exec streamlit run gui/streamlit_app.py \
+    --server.address="${HOST}" \
+    --server.port="${PORT}" \
+    --server.enableCORS=false \
+    --server.enableXsrfProtection=false \
+    --server.headless=true \
+    --browser.gatherUsageStats=false