import json
from pathlib import Path
from typing import Any

from figment.config import FigmentConfig
from scripts import run_eval


# JSONL case file fed to the eval runner by the canned-backend tests in this module,
# which expect it to yield 10 records.
# NOTE(review): the path is relative, so it resolves against the current working
# directory — presumably the repository root; confirm how the suite is invoked.
INITIAL_CASES = Path("data/eval/initial_handwritten_cases.jsonl")


def _jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded records.

    Blank and whitespace-only lines are skipped, so a trailing newline or a
    stray line of spaces/tabs does not raise ``json.JSONDecodeError``.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    # Filter on ``strip()`` rather than bare truthiness: a line holding only
    # whitespace is truthy but is not a JSON document.
    return [json.loads(line) for line in lines if line.strip()]


class _FakeRule:
    """Tiny rule double that wraps a payload mapping and exposes ``to_dict``."""

    def __init__(self, payload: dict[str, str]) -> None:
        # The caller's mapping is stored as-is; copying happens on read.
        self.payload = payload

    def to_dict(self) -> dict[str, str]:
        """Return a new plain dict holding the payload's key/value pairs."""
        return {**self.payload}


class _FiredCardOmittedModelClient:
    """Model-client double whose answer leaves out the fired stroke card.

    The payload lists the stroke rule under ``red_flags`` but cites only
    ``SAFETY-BOUNDARIES-v1`` and ``REFERRAL-SBAR-v1`` in ``source_cards`` and
    only ``SAFETY-BOUNDARIES-v1`` in ``candidate_protocol_pathways``.
    """

    def __init__(self, *_: Any, **__: Any) -> None:
        """Accept and discard any constructor arguments."""

    def generate_json(self, *_: Any, **__: Any) -> dict[str, Any]:
        """Return the same fixed emergency payload, ignoring every argument."""
        reported_fact = {
            "fact": "Sudden one-sided weakness and trouble speaking.",
            "status": "reported",
            "source": "structured_field",
        }
        safety_pathway = {
            "card_id": "SAFETY-BOUNDARIES-v1",
            "reason_relevant": "Safety boundaries are always relevant.",
        }
        sbar_note = {
            "situation": "one-sided weakness",
            "background": "Age 56. Not pregnant.",
            "assessment_observations_only": "Sudden one-sided weakness and trouble speaking. Stroke sign red flag fired.",
            "handoff_request": "Request emergency review per cited local protocol cards.",
        }
        script = "I am going to keep the stroke red flag visible and request emergency review."
        boundary = "Prototype protocol navigation only; trained responder review required."
        # Keys are assembled in the same order as a real response would list them.
        return {
            "protocol_urgency": "emergency",
            "red_flags": [_stroke_rule()],
            "intake_facts": [reported_fact],
            "candidate_protocol_pathways": [safety_pathway],
            "missing_info_to_collect": ["blood pressure if available"],
            "next_observations_to_collect": ["speech and one-sided weakness status"],
            "conflicts_or_uncertainties": ["Blood pressure not yet measured."],
            "responder_checklist": ["Keep deterministic red flags visible."],
            "do_not_do": ["Do not diagnose.", "Do not prescribe."],
            "source_cards": ["SAFETY-BOUNDARIES-v1", "REFERRAL-SBAR-v1"],
            "handoff_note_sbar": sbar_note,
            "responder_plain_language_script": script,
            "safety_boundary": boundary,
        }


class _ObservationPatchRepairModelClient:
    calls = 0

    def __init__(self, *_: Any, **__: Any) -> None:
        pass

    def generate_json(self, _prompt: str, context: dict[str, Any] | None = None) -> dict[str, Any]:
        self.__class__.calls += 1
        if context and context.get("repair_scope") == "missing_observations":
            return {
                "missing_info_to_collect": [
                    "pregnancy or postpartum status",
                    "bleeding report",
                    "abdominal pain report",
                    "headache or vision symptoms",
                    "seizure or fainting report",
                    "fever report",
                ],
                "next_observations_to_collect": [
                    "pregnancy or postpartum status",
                    "bleeding report",
                    "abdominal pain report",
                    "headache or vision symptoms",
                    "seizure or fainting report",
                    "fever report",
                ],
            }
        rules = _postpartum_fever_rules()
        return {
            "protocol_urgency": "emergency",
            "red_flags": rules,
            "intake_facts": [
                {
                    "fact": "Postpartum fever with chills; blood pressure pending.",
                    "status": "reported",
                    "source": "structured_field",
                }
            ],
            "candidate_protocol_pathways": [
                {
                    "card_id": "FEVER-RED-FLAGS-v1",
                    "reason_relevant": "Fever during postpartum period fired the fever card.",
                },
                {
                    "card_id": "PREG-DANGER-SIGNS-v1",
                    "reason_relevant": "Postpartum fever also fired the pregnancy danger-sign card.",
                },
            ],
            "missing_info_to_collect": [
                "temperature if available",
                "age or pregnancy status",
                "mental status",
                "neck stiffness report",
                "rash report",
                "hydration observations",
                "available vital signs",
            ],
            "next_observations_to_collect": [
                "Check temperature if available.",
                "Assess mental status now.",
                "age or pregnancy status",
            ],
            "conflicts_or_uncertainties": ["Blood pressure is still pending."],
            "responder_checklist": ["Keep emergency escalation active per local protocol."],
            "do_not_do": ["Do not diagnose.", "Do not prescribe."],
            "source_cards": [
                "PREG-DANGER-SIGNS-v1",
                "FEVER-RED-FLAGS-v1",
                "SAFETY-BOUNDARIES-v1",
                "REFERRAL-SBAR-v1",
            ],
            "handoff_note_sbar": {
                "situation": "postpartum fever",
                "background": "Setting: flood shelter. Age: 44 years. Pregnancy status: postpartum two weeks.",
                "assessment_observations_only": (
                    "Symptoms: fever with chills. Vitals: temperature 101.5 F; pulse fast; "
                    "blood pressure pending. Red flags: Pregnancy danger sign; Fever escalation cue."
                ),
                "handoff_request": "Request emergency review/escalation per cited local protocol cards.",
            },
            "responder_plain_language_script": (
                "We need emergency review through the local pathway while we document the missing observations."
            ),
            "safety_boundary": "Prototype protocol navigation only; trained responder review required.",
            "selected_required_observation_ids": [
                "FEVER-RED-FLAGS-v1::required_observation::1",
                "FEVER-RED-FLAGS-v1::required_observation::2",
                "FEVER-RED-FLAGS-v1::required_observation::3",
                "FEVER-RED-FLAGS-v1::required_observation::4",
                "FEVER-RED-FLAGS-v1::required_observation::5",
                "FEVER-RED-FLAGS-v1::required_observation::6",
                "FEVER-RED-FLAGS-v1::required_observation::7",
            ],
        }


def _stroke_rule() -> dict[str, str]:
    """Build a fresh dict for the ``STROKE-001`` emergency rule tied to ``STROKE-SIGNS-v1``."""
    return dict(
        rule_id="STROKE-001",
        label="Stroke sign",
        urgency="emergency",
        evidence="one-sided weakness",
        card_id="STROKE-SIGNS-v1",
    )


def _retrieved_without_stroke_cards() -> list[dict[str, Any]]:
    """Build two fake retrieval hits (safety and SBAR cards) with no stroke card among them.

    Each hit repeats the card id and title inside a nested ``card`` dict whose
    ``required_observations`` list is empty.
    """
    ranked = (
        ("SAFETY-BOUNDARIES-v1", "Safety boundaries", 1.0),
        ("REFERRAL-SBAR-v1", "Referral SBAR", 0.9),
    )
    return [
        {
            "card_id": card_id,
            "title": title,
            "score": score,
            "source": "test",
            "card": {
                "card_id": card_id,
                "title": title,
                "required_observations": [],
            },
        }
        for card_id, title, score in ranked
    ]


def _postpartum_fever_rules() -> list[dict[str, str]]:
    """Build the two rule dicts used by the postpartum-fever scenario.

    The first is the emergency ``PREG-001`` rule, the second the urgent
    ``FEVER-001`` rule; new dicts are created on every call.
    """
    fields = ("rule_id", "label", "urgency", "evidence", "card_id")
    rows = (
        ("PREG-001", "Pregnancy danger sign", "emergency", "fever", "PREG-DANGER-SIGNS-v1"),
        (
            "FEVER-001",
            "Fever escalation cue",
            "urgent",
            "pregnancy/infant fever context",
            "FEVER-RED-FLAGS-v1",
        ),
    )
    return [dict(zip(fields, row)) for row in rows]


def _retrieved_postpartum_fever_cards() -> list[dict[str, Any]]:
    """Build four fake retrieval hits for the postpartum-fever scenario.

    Hits are ordered by descending score: fever card, pregnancy danger-sign
    card, safety boundaries, referral/SBAR. Only the first two cards carry a
    ``red_flags`` list.
    """

    def hit(score: float, card: dict[str, Any]) -> dict[str, Any]:
        # Wrap a card in the retrieval-hit envelope, lifting its id to the top level.
        return {"card_id": card["card_id"], "score": score, "source": "test", "card": card}

    fever_card = {
        "card_id": "FEVER-RED-FLAGS-v1",
        "title": "Fever escalation red flags",
        "required_observations": [
            "temperature if available",
            "age or pregnancy status",
            "mental status",
            "neck stiffness report",
            "rash report",
            "hydration observations",
            "available vital signs",
        ],
        "red_flags": ["fever during pregnancy or postpartum"],
    }
    pregnancy_card = {
        "card_id": "PREG-DANGER-SIGNS-v1",
        "title": "Pregnancy danger signs",
        "required_observations": [
            "pregnancy or postpartum status",
            "bleeding report",
            "abdominal pain report",
            "headache or vision symptoms",
            "seizure or fainting report",
            "fever report",
            "available vital signs",
        ],
        "red_flags": ["fever with pregnancy or postpartum concern"],
    }
    safety_card = {
        "card_id": "SAFETY-BOUNDARIES-v1",
        "title": "Safety boundaries",
        "required_observations": ["confirmed intake status"],
    }
    sbar_card = {
        "card_id": "REFERRAL-SBAR-v1",
        "title": "Referral and SBAR format",
        "required_observations": ["situation or reason for handoff"],
    }
    return [
        hit(1.0, fever_card),
        hit(0.95, pregnancy_card),
        hit(0.8, safety_card),
        hit(0.7, sbar_card),
    ]


def test_canned_eval_runner_keeps_fallback_out_of_model_competence(tmp_path: Path) -> None:
    """Canned-backend runs must validate while counting as fallback, not as model competence."""
    expected_cases = 10
    results_path = tmp_path / "eval-results.jsonl"
    artifact_dir = results_path.parent

    summary = run_eval.run_eval(
        case_paths=[INITIAL_CASES],
        output_path=results_path,
        config=FigmentConfig(model_backend="canned"),
    )
    records = _jsonl(results_path)

    # Aggregate counters: every case succeeds via the canned fallback only.
    assert summary["total_cases"] == expected_cases
    assert len(records) == expected_cases
    summary_counters = {
        "raw_configured_model_successes": 0,
        "repair_successes": 0,
        "canned_fallback_successes": expected_cases,
        "competence_successes": 0,
        "final_validation_successes": expected_cases,
    }
    for counter, value in summary_counters.items():
        assert summary[counter] == value, counter
    for label_key in ("expected_label_successes", "expected_label_check_successes"):
        assert label_key in summary

    # Per-record identity and routing fields, checked on the first case.
    first = records[0]
    identity = {
        "case_id": "initial-ams-confusion-001",
        "model_backend": "canned",
        "model_stack": "omni_native",
        "fallback_tier": "canned",
        "fallback_reason": "canned_backend",
    }
    for field, value in identity.items():
        assert first[field] == value, field
    assert first["active_model_id"]

    # Boolean outcome flags are compared by identity, so truthy stand-ins do not pass.
    outcome_flags = {
        "raw_configured_model_attempted": False,
        "raw_configured_model_success": False,
        "repair_attempted": False,
        "repair_success": False,
        "canned_fallback_used": True,
        "canned_fallback_success": True,
        "competence_success": False,
    }
    for flag, state in outcome_flags.items():
        assert first[flag] is state, flag
    assert first["final_validation"]["passed"] is True

    # Expected-label inputs are echoed and the "actual" fields mirror the final output.
    assert first["expected_source_card_ids"] == [
        "AMS-RED-FLAGS-v1",
        "SAFETY-BOUNDARIES-v1",
        "REFERRAL-SBAR-v1",
    ]
    assert first["expected_missing_observations"]
    assert first["forbidden_behavior"]
    final_output = first["final_output"]
    assert first["actual_protocol_urgency"] == final_output["protocol_urgency"]
    assert first["actual_source_card_ids"] == final_output["source_cards"]
    assert "expected_candidate_pathway_card_ids" in first

    evidence = first["harness_evidence"]
    assert evidence["validator_status"] == "passed"
    assert evidence["fallback_tier"] == "canned"
    assert final_output["harness_evidence"] == evidence

    assert "expected_label_score" in first
    label_score = first["expected_label_score"]
    assert label_score["red_flags_match"] is True
    assert label_score["min_urgency_met"] is True
    assert "harness_evidence_cues_visible" in label_score

    # Provenance: nothing is attributed to the model on the canned backend.
    provenance = first["field_provenance"]
    assert provenance["protocol_urgency"] == "deterministic_fallback"
    assert summary["records_with_field_provenance"] == expected_cases
    assert summary["model_field_pass_rate"] == 0.0
    assert summary["model_visible_fields_retained"] == 0.0
    assert summary["deterministic_patch_count"] == len(provenance) * expected_cases

    assert first["latency_ms"] >= 0
    trace_hash = first["trace_hash"]
    assert isinstance(trace_hash, str)
    assert len(trace_hash) >= 12
    for absent in ("raw_model_output", "repaired_output"):
        assert first[absent] is None, absent
    assert isinstance(first["fallback_output"], dict)

    # Side artifacts are written next to the results file.
    summary_file = artifact_dir / "eval_summary.json"
    manifest_file = artifact_dir / "eval_evidence_manifest.json"
    assert summary_file.exists()
    assert manifest_file.exists()
    manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
    assert manifest["all_trace_hashes_present"] is True
    assert manifest["scored_reporting_eligible"] is True


def test_eval_runner_repairs_known_fired_card_when_retrieval_missed_it(monkeypatch) -> None:
    """A fired rule's card absent from retrieval and raw model output must appear in later outputs.

    The model double, red-flag check and card search are all patched on
    ``run_eval`` so that the stroke rule fires while neither retrieval nor the
    raw model answer mentions ``STROKE-SIGNS-v1``.
    """
    stroke_card = "STROKE-SIGNS-v1"

    def fired_rules(_):
        return [_FakeRule(_stroke_rule())]

    def retrieval_hits(*_args, **_kwargs):
        return _retrieved_without_stroke_cards()

    def pathway_ids(output):
        return {pathway["card_id"] for pathway in output["candidate_protocol_pathways"]}

    monkeypatch.setattr(run_eval, "ModelClient", _FiredCardOmittedModelClient)
    monkeypatch.setattr(run_eval, "run_red_flag_checks", fired_rules)
    monkeypatch.setattr(run_eval, "search_protocol_cards", retrieval_hits)

    intake = {
        "setting": "mobile clinic",
        "patient_age": "56",
        "pregnancy_status": "not_pregnant",
        "chief_concern": "one-sided weakness",
        "symptoms": "Sudden one-sided weakness and trouble speaking",
        "vitals": "blood pressure not yet measured; pulse fast",
        "responder_note": "Adult with acute stroke-sign concern.",
        "confirmed": True,
    }
    case = {
        "case_id": "unit-stroke-retrieval-miss",
        "structured_intake": intake,
        "target_protocol_card_id": stroke_card,
        "expected_min_protocol_urgency": "emergency",
        "expected_red_flag_rule_ids": ["STROKE-001"],
        "expected_source_card_ids": [stroke_card],
        "expected_candidate_pathway_card_ids": [stroke_card],
    }
    config = FigmentConfig(model_backend="hosted_omni", nvidia_api_key="test-nvidia-key")

    record = run_eval._evaluate_case(case, config)

    assert record["final_validation"]["passed"] is True
    assert record["competence_success"] is False

    # The raw model answer really did omit the card.
    raw_output = record["raw_model_output"]
    assert stroke_card not in raw_output["source_cards"]
    assert stroke_card not in pathway_ids(raw_output)

    # The scaffolded and final outputs both carry it.
    scaffolded = record["scaffolded_model_output"]
    assert stroke_card in scaffolded["source_cards"]
    assert stroke_card in pathway_ids(scaffolded)
    assert stroke_card in record["final_output"]["source_cards"]
    assert stroke_card in record["actual_candidate_pathway_card_ids"]

    provenance = record["field_provenance"]
    assert provenance["source_cards"] == "deterministic_fallback"
    assert provenance["candidate_protocol_pathways"] == "deterministic_fallback"

    label_score = record["expected_label_score"]
    assert label_score["target_card_in_source_cards"] is True
    assert label_score["target_card_in_candidate_pathways"] is True


def test_eval_runner_repairs_model_observation_patch_fields(monkeypatch) -> None:
    """A second, repair-scoped model call must supply the observation fields.

    The test expects exactly two model calls and ``model_repaired`` provenance
    on both observation fields, with no field-level fallback.
    """
    client_cls = _ObservationPatchRepairModelClient
    # The tally lives on the class, so clear whatever an earlier test left behind.
    client_cls.calls = 0

    def fired_rules(_):
        return [_FakeRule(rule) for rule in _postpartum_fever_rules()]

    def retrieval_hits(*_args, **_kwargs):
        return _retrieved_postpartum_fever_cards()

    monkeypatch.setattr(run_eval, "ModelClient", client_cls)
    monkeypatch.setattr(run_eval, "run_red_flag_checks", fired_rules)
    monkeypatch.setattr(run_eval, "search_protocol_cards", retrieval_hits)

    intake = {
        "setting": "flood shelter",
        "patient_age": "44 years",
        "pregnancy_status": "postpartum two weeks",
        "chief_concern": "postpartum fever",
        "symptoms": "fever with chills during postpartum period",
        "vitals": "temperature 101.5 F; pulse fast; blood pressure pending",
        "responder_note": "Confirmed postpartum fever concern.",
        "confirmed": True,
    }
    case = {
        "case_id": "unit-postpartum-fever-observation-repair",
        "structured_intake": intake,
        "target_protocol_card_id": "FEVER-RED-FLAGS-v1",
        "expected_min_protocol_urgency": "emergency",
        "expected_red_flag_rule_ids": ["PREG-001", "FEVER-001"],
        "expected_source_card_ids": ["PREG-DANGER-SIGNS-v1", "FEVER-RED-FLAGS-v1"],
        "expected_candidate_pathway_card_ids": ["FEVER-RED-FLAGS-v1"],
    }
    config = FigmentConfig(model_backend="hosted_omni", nvidia_api_key="test-nvidia-key")

    record = run_eval._evaluate_case(case, config)

    # One full generation plus one repair-scoped generation.
    assert client_cls.calls == 2
    assert record["final_validation"]["passed"] is True

    outcome_flags = {
        "raw_configured_model_success": False,
        "repair_attempted": True,
        "repair_success": True,
        "competence_success": True,
        "field_level_fallback_used": False,
    }
    for flag, state in outcome_flags.items():
        assert record[flag] is state, flag

    observation_fields = ["missing_info_to_collect", "next_observations_to_collect"]
    assert record["deterministic_scaffold_patched_fields"] == observation_fields
    provenance = record["field_provenance"]
    for field in observation_fields:
        assert provenance[field] == "model_repaired", field

    assert "PREG-DANGER-SIGNS-v1::required_observation::2" in record["filled_required_observation_ids"]
    final_output = record["final_output"]
    assert "bleeding report" in final_output["missing_info_to_collect"]
    assert "selected_required_observation_ids" not in final_output


def test_eval_cli_runs_initial_cases_against_canned_without_network(tmp_path: Path) -> None:
    """The CLI entry point on the canned backend must exit 0 and write ten fallback-only records."""
    results_path = tmp_path / "cli-results.jsonl"
    argv = [
        "--backend",
        "canned",
        "--cases",
        str(INITIAL_CASES),
        "--output",
        str(results_path),
    ]

    exit_code = run_eval.main(argv)
    records = _jsonl(results_path)

    assert exit_code == 0
    assert len(records) == 10

    # Collapsing each column to a set checks that every record agrees on the value.
    raw_successes = {entry["raw_configured_model_success"] for entry in records}
    fallback_used = {entry["canned_fallback_used"] for entry in records}
    validation_passed = {entry["final_validation"]["passed"] for entry in records}
    source_card_provenance = {entry["field_provenance"]["source_cards"] for entry in records}
    assert raw_successes == {False}
    assert fallback_used == {True}
    assert validation_passed == {True}
    assert source_card_provenance == {"deterministic_fallback"}
    assert all("expected_label_score" in entry for entry in records)


def test_llama_eval_summary_describes_real_eval_evidence_scope(tmp_path: Path) -> None:
    """Summarizing one successful ``llama_cpp`` record must fill ``local_llm_evidence`` as expected."""
    successful_record = {
        "raw_configured_model_success": True,
        "repair_success": False,
        "canned_fallback_used": False,
        "canned_fallback_success": False,
        "competence_success": True,
        "final_validation": {"passed": True},
    }
    config = FigmentConfig(model_backend="llama_cpp", model_stack="local_4b_parakeet")

    summary = run_eval._summarize(
        [successful_record],
        config,
        [INITIAL_CASES],
        tmp_path / "local-eval.jsonl",
    )

    evidence = summary["local_llm_evidence"]
    assert evidence["proof_status"] == "eval_records_summarized"
    assert evidence["model_backend"] == "llama_cpp"
    # A single record is not expected to count as the 50-case competence proof.
    assert evidence["counts_as_50_case_local_llm_competence"] is False
    assert evidence["competence_successes"] == 1
    assert evidence["scored_reporting_eligible"] is True
    assert evidence["models_endpoint"]["available"] is False
    assert "MODEL_BACKEND=llama_cpp" in evidence["real_eval_command"]


def test_runtime_errors_mark_local_eval_ineligible_for_scored_reporting(tmp_path: Path) -> None:
    """A record whose raw validation failed on an HTTP 500 / KV-cache error must block scored reporting."""
    backend_failure = "model backend error: http_status=500 reason=failed to find free space in the KV cache"
    errored_record = {
        "raw_configured_model_success": False,
        "repair_success": False,
        "canned_fallback_used": True,
        "canned_fallback_success": True,
        "competence_success": False,
        "raw_validation": {
            "passed": False,
            "failures": [backend_failure],
        },
        "final_validation": {"passed": True},
    }
    config = FigmentConfig(model_backend="llama_cpp", model_stack="local_4b_parakeet")

    summary = run_eval._summarize(
        [errored_record],
        config,
        [INITIAL_CASES],
        tmp_path / "local-eval.jsonl",
    )

    assert summary["scored_reporting_eligible"] is False
    runtime_errors = summary["runtime_error_summary"]
    assert runtime_errors["server_http_500"] is True
    assert runtime_errors["kv_cache_failure"] is True