Spaces:

moddux
/

mod-osint

Runtime error

File size: 11,454 Bytes

b75c637

"""
MOD-OSINT Streamlit GUI Wizard
Wired to engine.pipeline_orchestrator.run_pipeline()

Stages:
  A — Upload / Input selection
  B — Settings
  C — Run pipeline
  D — Browse / Export results

Import safety:
  This module avoids importing Streamlit at module load time so CI/tests can
  import it without ScriptRunContext warnings.
"""
from __future__ import annotations

import sqlite3
import tempfile
from pathlib import Path

import pandas as pd

# Built-in demo dataset directory (a relative path, so it is resolved against
# the process working directory); used as the fallback pipeline input.
_DEMO_DIR = Path("samples/demo_ingest")


def _load_yaml_defaults(path: Path) -> dict:
    """Load pipeline defaults from a YAML file on a best-effort basis.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The parsed top-level mapping. An empty dict is returned when PyYAML is
        not installed, the file cannot be read or parsed, the file is empty, or
        the document's top level is not a mapping.
    """
    try:
        import yaml  # optional; provided by requirements-hf.txt

        loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best-effort: a missing or broken config file must not
        # prevent the GUI from starting with its built-in defaults.
        return {}
    # An empty file parses to None, and a list/scalar document cannot serve as
    # a config mapping; callers rely on getting a real dict back.
    return loaded if isinstance(loaded, dict) else {}


def _write_uploads(uploads) -> Path:
    """Save uploaded files into a fresh temp dir and return the uploads dir.

    Args:
        uploads: Iterable of upload objects exposing a ``name`` attribute and a
            ``getbuffer()`` method that returns the file content.

    Returns:
        Path of the ``uploads`` sub-directory created inside a new temporary
        directory (prefix ``modosint_``), containing one file per upload.
    """
    tmp = Path(tempfile.mkdtemp(prefix="modosint_"))
    updir = tmp / "uploads"
    updir.mkdir(parents=True, exist_ok=True)
    for index, file_obj in enumerate(uploads):
        # The name is supplied by the client: keep only its final component so
        # "../x" or an absolute path cannot be written outside `updir`.
        safe_name = Path(str(file_obj.name).replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            # No usable file name; fall back to a positional placeholder
            # instead of trying to write onto the directory itself.
            safe_name = f"upload_{index}"
        (updir / safe_name).write_bytes(file_obj.getbuffer())
    return updir


def _resolve_input(session_state) -> Path | None:
    """Determine input from session state (uploads > local path > demo)."""
    uploads = session_state.get("_uploads")
    if uploads:
        return _write_uploads(uploads)

    local_path = session_state.get("_local_path", "").strip()
    if local_path:
        path_obj = Path(local_path).expanduser()
        if path_obj.exists():
            return path_obj

    if session_state.get("_use_demo") and _DEMO_DIR.exists():
        return _DEMO_DIR

    return None


def main() -> None:
    """Entrypoint for `streamlit run gui/streamlit_app.py`.

    Renders a four-tab wizard: Upload (choose input), Settings (build the
    effective config), Run (call ``run_pipeline``) and Browse (inspect the
    last run's report, exports and SQLite database).

    Streamlit and the engine modules are imported inside this function rather
    than at module level, so importing this module does not load them.
    """
    import json
    from contextlib import closing

    import streamlit as st
    import streamlit.components.v1 as components

    from engine.pipeline_orchestrator import run_pipeline
    from gui.terminal_panel import render_terminal

    st.set_page_config(
        page_title="MOD-OSINT",
        page_icon="🧠",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.title("🧠 MOD-OSINT")
    st.caption("GUI wizard -> `engine.pipeline_orchestrator.run_pipeline()`")

    # Seed the session keys the tabs below read, so lookups never fail.
    if "effective_config" not in st.session_state:
        st.session_state["effective_config"] = {}
    if "last_run_id" not in st.session_state:
        st.session_state["last_run_id"] = None
    if "last_run_dir" not in st.session_state:
        st.session_state["last_run_dir"] = None

    with st.sidebar:
        # Rendered before the Settings tab runs, so this shows the config
        # stored by the previous script run.
        render_terminal({"effective_config": st.session_state["effective_config"]})

    tab_upload, tab_settings, tab_run, tab_browse = st.tabs(
        ["📂 Upload", "⚙️ Settings", "▶️ Run", "📊 Browse"]
    )

    # ------------------------------------------------------------------ A
    with tab_upload:
        st.subheader("A) Upload or select input")

        uploads = st.file_uploader(
            "Upload files (CSV, JSON, TXT, HTML, LOG)",
            accept_multiple_files=True,
            key="_uploads",
        )
        if uploads:
            st.success(f"Queued {len(uploads)} file(s): {[u.name for u in uploads]}")

        st.divider()

        local_path = st.text_input(
            "Or enter a local directory / file path",
            value="",
            key="_local_path",
            placeholder="/path/to/data/",
        )

        st.divider()

        st.checkbox(
            f"Use built-in demo dataset (`{_DEMO_DIR}`)",
            value=not bool(uploads) and not bool(local_path),
            key="_use_demo",
            disabled=not _DEMO_DIR.exists(),
            help="Runs the pipeline against samples/demo_ingest/ for quick smoke testing.",
        )

        if _DEMO_DIR.exists():
            demo_files = sorted(_DEMO_DIR.iterdir())
            st.caption(f"Demo files: {[f.name for f in demo_files if f.is_file()]}")
        else:
            st.caption("`samples/demo_ingest/` not found in working directory.")

    # ------------------------------------------------------------------ B
    with tab_settings:
        st.subheader("B) Pipeline settings")

        cfg_path = Path("pipeline_config.yaml")
        defaults = _load_yaml_defaults(cfg_path) if cfg_path.exists() else {}
        if not isinstance(defaults, dict):
            # A YAML document whose top level is not a mapping is unusable here.
            defaults = {}

        col_left, col_right = st.columns(2)
        with col_left:
            offline_mode = st.toggle(
                "offline_mode",
                value=True,
                help="Disable all outbound network calls.",
            )
            enable_ml = st.toggle(
                "enable_ml_analysis",
                value=False,
                help="Enable ML/NLP stage (requires torch; off by default).",
            )
        with col_right:
            correlation_mode = st.selectbox(
                "correlation_mode",
                ["basic", "in-memory"],
                index=0,
                help="basic = simple entity matching; in-memory = graph in RAM.",
            )

        effective_config: dict = dict(defaults)
        # Copy the nested "runtime" mapping instead of mutating the loaded
        # defaults, and tolerate a null / non-mapping "runtime" entry.
        base_runtime = defaults.get("runtime")
        runtime: dict = dict(base_runtime) if isinstance(base_runtime, dict) else {}
        runtime.update(
            {
                "offline_mode": offline_mode,
                "enable_ml_analysis": enable_ml,
                "correlation_mode": correlation_mode,
            }
        )
        effective_config["runtime"] = runtime
        st.session_state["effective_config"] = effective_config

        st.markdown("**Effective config (passed to engine):**")
        st.json(effective_config)

    # ------------------------------------------------------------------ C
    with tab_run:
        st.subheader("C) Run pipeline")
        st.caption("Outputs are written to `runs/<run_id>/`.")

        # NOTE(review): this runs on every script rerun; when uploads are
        # queued, _resolve_input writes them to a new temp directory each time.
        input_path = _resolve_input(st.session_state)
        if input_path:
            st.info(f"Input resolved -> `{input_path}`")
        else:
            st.warning("No input selected. Go to Upload tab or enable demo dataset.")

        run_btn = st.button("🚀 Run pipeline now", type="primary", disabled=input_path is None)

        if run_btn and input_path:
            progress = st.progress(0, text="Starting...")
            log_area = st.empty()
            log_lines: list[str] = []

            def _log(message: str) -> None:
                """Append a line to the on-page log, showing the last 40 lines."""
                log_lines.append(message)
                log_area.code("\n".join(log_lines[-40:]), language="bash")

            _log(f"Input: {input_path}")
            _log("Calling engine.pipeline_orchestrator.run_pipeline()...")
            progress.progress(10, text="Normalizing files...")

            try:
                ctx = run_pipeline(
                    input_path=input_path,
                    config=st.session_state["effective_config"],
                )
                # Remember the run so the Browse tab can find its outputs.
                st.session_state["last_run_id"] = ctx.run_id
                st.session_state["last_run_dir"] = str(ctx.run_dir)

                progress.progress(90, text="Generating report...")
                _log(f"Run ID:  {ctx.run_id}")
                _log(f"Run dir: {ctx.run_dir}")

                if ctx.stage_results:
                    for stage_name, stage_out in ctx.stage_results.items():
                        _log(f"  [{stage_out.status.value.upper():8s}] {stage_name}")

                progress.progress(100, text="Done")
                st.success(f"Pipeline complete - run `{ctx.run_id}`")
                st.code(str(ctx.run_dir))
                st.info("Switch to Browse tab to explore outputs.")

            except Exception as exc:
                # Top-level UI boundary: surface any pipeline failure on the
                # page rather than letting the script crash.
                progress.empty()
                st.error(f"Pipeline failed: {exc}")
                _log(f"ERROR: {exc}")

    # ------------------------------------------------------------------ D
    with tab_browse:
        st.subheader("D) Browse results")

        run_dir_str = st.session_state.get("last_run_dir")
        if not run_dir_str:
            st.info("Run the pipeline first (Stage C).")
            return

        run_dir = Path(run_dir_str)
        report_html = run_dir / "report" / "index.html"
        db_path = run_dir / "db.sqlite"
        exports_dir = run_dir / "exports"
        manifest_path = run_dir / "manifest.json"

        # Count files only, so the metric agrees with the Exports listing
        # below (which also skips directories).
        if exports_dir.exists():
            export_count = sum(1 for entry in exports_dir.rglob("*") if entry.is_file())
        else:
            export_count = 0

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Run ID", st.session_state.get("last_run_id", "-"))
        col2.metric("Report", "yes" if report_html.exists() else "no")
        col3.metric("DB", "yes" if db_path.exists() else "no")
        col4.metric("Exports", str(export_count))

        if manifest_path.exists():
            with st.expander("Run manifest"):
                try:
                    st.json(json.loads(manifest_path.read_text()))
                except (OSError, ValueError) as exc:
                    # An unreadable or malformed manifest should not take
                    # down the rest of the Browse tab.
                    st.warning(f"Could not read manifest.json: {exc}")

        st.divider()

        st.markdown("### HTML Report")
        if report_html.exists():
            st.markdown(f"`{report_html}`")
            try:
                components.html(report_html.read_text(errors="replace"), height=700, scrolling=True)
            except Exception as exc:
                st.warning(f"Inline render failed ({exc}). Open the path above in a browser.")

            with open(report_html, "rb") as file_handle:
                st.download_button(
                    "Download report/index.html",
                    data=file_handle,
                    file_name="index.html",
                    mime="text/html",
                )
        else:
            st.info("No report/index.html yet.")

        st.divider()

        st.markdown("### Exports")
        if exports_dir.exists():
            export_files = sorted(path for path in exports_dir.rglob("*") if path.is_file())
            if export_files:
                for export_file in export_files:
                    rel = export_file.relative_to(run_dir).as_posix()
                    col_path, col_download = st.columns([3, 1])
                    col_path.write(f"`{rel}`")
                    with open(export_file, "rb") as file_handle:
                        col_download.download_button(
                            "Download",
                            data=file_handle,
                            file_name=export_file.name,
                            # Every button shares the "Download" label, so the
                            # relative path provides the unique widget key.
                            key=f"dl_{rel}",
                        )
            else:
                st.info("Exports directory is empty.")
        else:
            st.info("No exports/ directory found.")

        jsonl_path = run_dir / "normalized.jsonl"
        if jsonl_path.exists():
            with open(jsonl_path, "rb") as file_handle:
                st.download_button(
                    "Download normalized.jsonl",
                    data=file_handle,
                    file_name="normalized.jsonl",
                    mime="application/x-ndjson",
                )

        st.divider()

        st.markdown("### SQLite DB Preview")
        if not db_path.exists():
            st.info("No db.sqlite found.")
            return

        with open(db_path, "rb") as file_handle:
            st.download_button(
                "Download db.sqlite",
                data=file_handle,
                file_name="db.sqlite",
                mime="application/x-sqlite3",
            )

        try:
            # closing() guarantees the connection is released even when a
            # query or a Streamlit call raises part-way through the preview.
            with closing(sqlite3.connect(db_path)) as conn:
                tables = pd.read_sql(
                    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
                    conn,
                )["name"].tolist()
                if tables:
                    st.write("Tables:", tables)
                    selected_table = st.selectbox("Preview table", tables, key="db_table_sel")
                    # Identifiers cannot be bound as parameters; quote with
                    # double quotes and escape embedded ones so any table
                    # name (including ones containing "]") is handled.
                    quoted_table = '"' + str(selected_table).replace('"', '""') + '"'
                    dataframe = pd.read_sql(f"SELECT * FROM {quoted_table} LIMIT 200;", conn)
                    st.dataframe(dataframe, use_container_width=True)
                else:
                    st.info("DB exists but contains no tables yet.")
        except Exception as exc:
            st.warning(f"DB preview failed: {exc}")


# Launch the wizard only when this file is executed as a script; importing
# the module does not call main().
if __name__ == "__main__":
    main()