"""
HuggingFace Spaces entry point for the Legal Text Anonymisation demo.

This is a Spaces-tailored copy of the main repo's demo/app.py:
- NER backbone defaults to `en_core_web_sm` for fast cold-start and can be
  overridden with the `SPACY_MODEL` environment variable (the fine-tuned
  RoBERTa checkpoint isn't bundled — too large for the free tier).
- Working directory layout is flat (no `src/` prefix), so the imports
  resolve via the local `anonymisation/` package.

To run locally for testing:
    pip install -r requirements.txt
    python -m spacy download en_core_web_sm
    python app.py
"""
from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Callable, List, Tuple

import gradio as gr

from anonymisation.mapping import SPACY_TO_TAB
from anonymisation.pipeline import LitePipeline, ProPipeline, MosaicScorer


# Module-wide lazy cache. Every slot starts as None; _get_predictor() fills
# "predictor" and "backend_label", and _get_scorer() fills "scorer".
_state = dict(predictor=None, backend_label=None, scorer=None)


def _make_spacy_predictor(model_name: str) -> Tuple[Callable, str]:
    """Load a spaCy model and wrap it as an entity-span predictor.

    Args:
        model_name: Name handed straight to ``spacy.load``.

    Returns:
        A ``(predict, label)`` pair. ``predict(text)`` returns a list of
        ``(start_char, end_char, mapped_label, surface_text)`` tuples, one for
        each spaCy entity whose label is a key of ``SPACY_TO_TAB``; entities
        with any other label are dropped. ``label`` is ``"spaCy <model_name>"``.
    """
    # Function-scope import: spaCy is only imported when a predictor is built.
    import spacy

    model = spacy.load(model_name)

    def predict(text: str):
        spans = []
        for ent in model(text).ents:
            if ent.label_ not in SPACY_TO_TAB:
                continue
            spans.append(
                (ent.start_char, ent.end_char, SPACY_TO_TAB[ent.label_], ent.text)
            )
        return spans

    return predict, f"spaCy {model_name}"


def _get_predictor() -> Tuple[Callable, str]:
    """Return the cached ``(predict, label)`` pair, building it on first use.

    On the first call, the spaCy model named by the ``SPACY_MODEL`` environment
    variable (default ``en_core_web_sm``) is loaded, stored in ``_state`` and
    announced on stdout. Later calls return the stored values without
    reloading anything.
    """
    cached = _state["predictor"]
    if cached is None:
        chosen_model = os.environ.get("SPACY_MODEL", "en_core_web_sm")
        cached, backend = _make_spacy_predictor(chosen_model)
        _state.update(predictor=cached, backend_label=backend)
        print(f"[spaces] Loaded NER backend: {backend}")
        return cached, backend
    return cached, _state["backend_label"]


def _get_scorer() -> MosaicScorer:
    """Return the cached mosaic scorer, creating it on first use.

    The scorer is built from the ``"test"`` split returned by
    ``anonymisation.data.load_tab``. If importing, loading or building fails
    for any reason, the failure is printed and ``MosaicScorer.empty()`` is
    used instead, so this is a best-effort load rather than a hard
    requirement.
    """
    if _state["scorer"] is not None:
        return _state["scorer"]

    try:
        from anonymisation.data import load_tab

        test_split = load_tab()["test"]
        scorer = MosaicScorer.from_tab(list(test_split))
    except Exception as exc:
        print(f"[spaces] Could not load TAB ({exc}); empty haystack.")
        scorer = MosaicScorer.empty()

    _state["scorer"] = scorer
    return scorer


def _backend_badge() -> str:
    """Return the Markdown line that names the active NER backend.

    Uses the label cached in ``_state`` when one is set; otherwise obtains it
    from ``_get_predictor()``.
    """
    backend = _state["backend_label"]
    if not backend:
        _, backend = _get_predictor()
    return f"**NER backend:** `{backend}` · **Coref:** Phase 5 post-processor enabled"


def redact(text: str, variant: str, k_target: int, max_iters: int,
           pseudonymise: bool, coref_extend: bool):
    """Run the selected pipeline on *text* and format the results for the UI.

    Args:
        text: Document to process. Empty or whitespace-only input
            short-circuits with placeholder outputs; no pipeline is run.
        variant: Mode label from the UI. A label starting with ``"Redact"``
            selects ``LitePipeline``; anything else selects ``ProPipeline``.
        k_target: Forwarded to ``ProPipeline`` as ``k_target`` (coerced to int).
        max_iters: Forwarded to ``ProPipeline`` as ``max_iterations`` (coerced
            to int).
        pseudonymise: Forwarded to the pipeline; also controls the vault
            status line and the message shown when the vault is empty.
        coref_extend: Forwarded to the pipeline as ``coref_extend``.

    Returns:
        A 5-tuple ``(redacted_text, backend_badge, status_markdown,
        vault_markdown, audit_json)``.
    """
    if not text or not text.strip():
        return (
            "", _backend_badge(),
            "_paste some text on the left and click Redact_",
            "_(no vault)_", "",
        )

    predictor, _ = _get_predictor()
    # One decision drives both the pipeline choice and the status line, so the
    # mosaic fields are only read from a result that ProPipeline produced.
    lite_mode = variant.startswith("Redact")

    if lite_mode:
        pipeline = LitePipeline(
            ner_provider=predictor,
            coref_extend=coref_extend,
            pseudonymise=pseudonymise,
        )
    else:
        pipeline = ProPipeline(
            ner_provider=predictor, scorer=_get_scorer(),
            # Gradio documents slider values as floats; the pipeline gets ints.
            k_target=int(k_target), max_iterations=int(max_iters),
            coref_extend=coref_extend, pseudonymise=pseudonymise,
        )

    result = pipeline(text)
    vault = result.pseudonym_vault

    parts: List[str] = []
    if not lite_mode:
        ck = "✓ converged" if result.converged else "✗ fallback to suppression"
        parts.append(
            f"**Mosaic risk:** k_initial = `{result.mosaic_risk_initial}` → "
            f"k_final = `{result.mosaic_risk_final}` · "
            f"iterations: `{result.iterations_used}` · {ck}"
        )
    if pseudonymise:
        parts.append(f"**Vault:** {len(vault)} entries (see Vault tab)")
    status = "  ·  ".join(parts) if parts else "_(no status — Redact mode: direct identifiers only)_"

    if vault:
        rows = "\n".join(
            f"| `{tok}` | {_md_table_cell(orig)} |" for tok, orig in vault.items()
        )
        vault_md = "| Token | Original surface form |\n|---|---|\n" + rows
    elif pseudonymise:
        # An empty vault with pseudonymisation enabled is not the same as "off".
        vault_md = "_(Pseudonymisation on — the vault is empty for this text)_"
    else:
        vault_md = "_(Pseudonymisation off — enable to see referential tokens here)_"

    return (
        result.redacted_text,
        _backend_badge(),
        status,
        vault_md,
        json.dumps(result.to_dict(), indent=2, ensure_ascii=False),
    )


def _md_table_cell(value) -> str:
    """Return *value* as text that fits in one Markdown table cell.

    Line breaks are collapsed to spaces and ``|`` is backslash-escaped, since
    either one would otherwise end the cell or the row early.
    """
    single_line = " ".join(str(value).splitlines())
    return single_line.replace("|", "\\|")


def _load_examples() -> List[List]:
    """Build the example rows from the ``examples/*.txt`` files.

    Looks for an ``examples`` directory next to this file. Each ``*.txt`` file,
    taken in sorted path order, becomes one row: the stripped file contents
    followed by fixed values for the other five inputs.

    Returns:
        One ``[text, "Anonymise (mosaic-aware)", 5, 3, False, True]`` list per
        file, or ``[]`` when the directory does not exist.
    """
    examples_dir = Path(__file__).parent / "examples"
    if not examples_dir.is_dir():
        return []
    return [
        [
            # Decode as UTF-8 explicitly so the text does not depend on the
            # host's locale encoding.
            path.read_text(encoding="utf-8").strip(),
            "Anonymise (mosaic-aware)", 5, 3, False, True,
        ]
        for path in sorted(examples_dir.glob("*.txt"))
    ]


# Markdown shown at the top of the page; build_ui() passes it to gr.Markdown.
# This is user-facing text, so edits here change what visitors read.
DESCRIPTION = """\
# Legal Text Anonymisation

> ⚠️ **This is a lightweight version of the full project.** To boot quickly on a
> free CPU, this demo runs spaCy's small `en_core_web_sm` model for name
> detection, so it *will* miss names the full project catches. The real pipeline
> uses a RoBERTa model fine-tuned on legal text (F1 0.85 vs ~0.4 here) — see the
> [portfolio write-up](https://github.com/WillCatt/Data-Anonymisation) for the
> full results. What's faithful here is the *logic*: the redaction modes, the
> mosaic re-identification scoring, and the audit trail.

Paste a document on the left and pick a mode. The full per-decision audit log is
on the **Audit log** tab.

- **Redact** — removes the direct identifiers (names, organisations, case and
  reference numbers, IBANs). The quick option.
- **Anonymise** — Redact, plus mosaic-aware generalisation: it broadens the
  remaining everyday details (dates, places, demographics) one step at a time
  until the document's residual fingerprint is no longer unique (reaches the
  target k-anonymity), so it can't be traced back to one person.
- **Pseudonymise** — assigns stable referential tokens (`[PERSON_A]`, `[PERSON_B]` …)
  instead of plain `[PERSON]`. The Vault tab shows the mapping; restore() reverses
  it locally so an external LLM's answer comes back with the real names.

> The mosaic scorer compares against the TAB corpus (1,268 ECHR cases) as a
> methodological stand-in for a firm's own document corpus.
"""


def _safe_textbox(**kwargs):
    """Create a ``gr.Textbox``, dropping keyword arguments it rejects.

    Whenever ``gr.Textbox(**kwargs)`` raises ``TypeError``, the first
    single-quoted token in the error message is treated as the offending
    keyword. If that token is one of the supplied keywords it is removed and
    construction is retried. Otherwise a minimal textbox is built from
    ``label`` (default ``""``) and ``lines`` (default ``10``) only.
    """
    while True:
        try:
            return gr.Textbox(**kwargs)
        except TypeError as exc:
            message = str(exc)
            offender = message.split("'")[1] if "'" in message else None
            if not offender or offender not in kwargs:
                # Nothing identifiable to strip: fall back to the bare minimum.
                return gr.Textbox(
                    label=kwargs.get("label", ""),
                    lines=kwargs.get("lines", 10),
                )
            del kwargs[offender]


def build_ui():
    """Assemble the Gradio Blocks app and return it without launching it.

    Layout: the description and the backend badge on top, then a two-column
    row with the input and controls on the left and a tab group (redacted text
    plus status, vault, audit log) on the right. Example rows from
    ``_load_examples()`` are added when there are any, and the button click is
    wired to ``redact``.
    """
    mode_labels = ["Redact (direct identifiers)", "Anonymise (mosaic-aware)"]

    with gr.Blocks(title="Legal Text Anonymisation") as demo:
        gr.Markdown(DESCRIPTION)
        # Load the NER backend while the UI is built, before the badge is made.
        _get_predictor()
        backend_md = gr.Markdown(_backend_badge())

        with gr.Row():
            # Left column: the document and the pipeline controls.
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    lines=14,
                    label="Input",
                    placeholder="Paste legal-style text here…",
                )
                variant = gr.Radio(
                    label="Mode",
                    choices=mode_labels,
                    value=mode_labels[1],
                )
                with gr.Row():
                    k_target = gr.Slider(
                        minimum=2, maximum=10, step=1, value=5,
                        label="Anonymise: k_target",
                    )
                    max_iters = gr.Slider(
                        minimum=1, maximum=5, step=1, value=3,
                        label="Anonymise: max iterations",
                    )
                with gr.Row():
                    pseudonymise = gr.Checkbox(label="Pseudonymise", value=False)
                    coref_extend = gr.Checkbox(label="Coref extension", value=True)
                run_btn = gr.Button(value="Redact", variant="primary")

            # Right column: one tab per output view.
            with gr.Column(scale=1):
                with gr.Tabs():
                    with gr.Tab("Redacted"):
                        redacted_out = _safe_textbox(
                            label="Redacted text", lines=12, show_copy_button=True,
                        )
                        status_md = gr.Markdown()
                    with gr.Tab("Vault"):
                        vault_md = gr.Markdown(
                            value="_(Pseudonymisation off — enable to see referential tokens here)_"
                        )
                    with gr.Tab("Audit log"):
                        audit_out = gr.Code(
                            language="json", label="Full per-decision audit (JSON)"
                        )

        # The same six inputs, in redact()'s parameter order, feed both the
        # example rows and the click handler.
        controls = [input_text, variant, k_target, max_iters, pseudonymise, coref_extend]

        example_rows = _load_examples()
        if example_rows:
            gr.Examples(
                examples=example_rows,
                inputs=controls,
                label="Pre-canned examples",
                # HF Spaces defaults to caching examples, which requires
                # fn+outputs and would run the pipeline on every example at
                # boot, so caching is switched off.
                cache_examples=False,
            )

        run_btn.click(
            fn=redact,
            inputs=controls,
            outputs=[redacted_out, backend_md, status_md, vault_md, audit_out],
        )

    return demo


if __name__ == "__main__":
    app = build_ui()
    # Listen on all interfaces; use $PORT when it is set, otherwise 7860.
    listen_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=listen_port)