"""Tests for QA evaluation dataset schema validity."""

import json
import os
import pytest
from pathlib import Path

EVAL_DIR = Path(__file__).parent.parent / "eval"
REQUIRED_FIELDS = {"id", "question", "gold_answer", "relevant_chunk_ids", "expected_terms", "difficulty"}
VALID_DIFFICULTIES = {"easy", "medium", "hard"}


def iter_jsonl(path: Path):
    """Yield parsed rows from a JSONL file."""
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            yield line_no, json.loads(line)


def get_eval_files():
    """Find all .jsonl files in eval/ directory."""
    if not EVAL_DIR.exists():
        return []
    return list(EVAL_DIR.glob("*.jsonl"))


@pytest.fixture(params=get_eval_files(), ids=lambda p: p.name)
def eval_file(request):
    return request.param


def test_eval_files_exist():
    """At least one eval JSONL file should exist."""
    files = get_eval_files()
    assert len(files) > 0, "No eval JSONL files found in eval/ directory"


def test_eval_file_has_required_fields(eval_file):
    """Every row in every eval file must have all required fields."""
    for line_no, row in iter_jsonl(eval_file):
        missing = REQUIRED_FIELDS - set(row.keys())
        assert not missing, f"{eval_file.name}:{line_no} missing fields: {missing}"


def test_eval_file_has_valid_types(eval_file):
    """Field types must be correct."""
    for line_no, row in iter_jsonl(eval_file):
        assert isinstance(row["id"], str), f"{eval_file.name}:{line_no} id must be string"
        assert isinstance(row["question"], str), f"{eval_file.name}:{line_no} question must be string"
        assert isinstance(row["gold_answer"], str), f"{eval_file.name}:{line_no} gold_answer must be string"
        assert isinstance(row["relevant_chunk_ids"], list), f"{eval_file.name}:{line_no} relevant_chunk_ids must be list"
        assert isinstance(row["expected_terms"], list), f"{eval_file.name}:{line_no} expected_terms must be list"
        assert isinstance(row["difficulty"], str), f"{eval_file.name}:{line_no} difficulty must be string"


def test_eval_file_has_valid_difficulty(eval_file):
    """Difficulty must be easy, medium, or hard."""
    for line_no, row in iter_jsonl(eval_file):
        assert row["difficulty"] in VALID_DIFFICULTIES, (
            f"{eval_file.name}:{line_no} invalid difficulty: {row['difficulty']}"
        )


def test_eval_file_has_unique_ids(eval_file):
    """All question IDs within a file must be unique."""
    ids = []
    for _, row in iter_jsonl(eval_file):
        ids.append(row["id"])
    assert len(ids) == len(set(ids)), f"{eval_file.name} has duplicate IDs"


def test_eval_file_questions_not_empty(eval_file):
    """Questions must not be empty."""
    for line_no, row in iter_jsonl(eval_file):
        assert len(row["question"].strip()) > 5, (
            f"{eval_file.name}:{line_no} question too short: {row['question']}"
        )


def test_starter_file_has_15_questions():
    """The 15-question starter file must have exactly 15 rows."""
    starter = EVAL_DIR / "qa_15_starter.jsonl"
    if not starter.exists():
        pytest.skip("qa_15_starter.jsonl not found")
    rows = list(iter_jsonl(starter))
    assert len(rows) == 15, f"Expected 15 questions, got {len(rows)}"


def test_full_file_has_50_questions():
    """The 50-question file must have exactly 50 rows."""
    full = EVAL_DIR / "qa_50_graphresearcher.jsonl"
    if not full.exists():
        pytest.skip("qa_50_graphresearcher.jsonl not found")
    rows = list(iter_jsonl(full))
    assert len(rows) == 50, f"Expected 50 questions, got {len(rows)}"