# Spaces: kmkarakaya / openLLMbenchmark (Running)
# File size: 4,997 Bytes
# 371efe0

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any


# Accepted question-id shape: the letter "q" followed by three or more digits (e.g. q001).
ID_PATTERN = re.compile(r"^q\d{3,}$")
# Instruction text returned under the "instruction" key by load_benchmark_payload.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful assistant. Answer in the same language as the user's question unless explicitly asked otherwise."
)


class DatasetValidationError(ValueError):
    """Signals that the benchmark dataset has an invalid structure or invalid record content."""


def _load_raw_dataset(path: Path) -> Any:
    """Decode and return the JSON document stored at *path*.

    Args:
        path: Location of the dataset file; read as UTF-8 text.

    Returns:
        Whatever value the JSON document decodes to.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    raise FileNotFoundError(f"Dataset not found: {path}")


def _extract_records(raw_payload: Any) -> list[dict[str, Any]]:
    """Locate the list of question records inside a decoded payload.

    Two layouts are accepted: the payload is itself a list, or it is a dict
    whose "questions" entry is a list. The list object is returned as-is
    (not copied), so mutating it mutates the payload.

    Raises:
        DatasetValidationError: If the payload matches neither layout.
    """
    if isinstance(raw_payload, dict):
        candidate = raw_payload.get("questions")
    else:
        candidate = raw_payload

    if isinstance(candidate, list):
        return candidate

    raise DatasetValidationError(
        "benchmark.json must be either a list of questions or an object with a 'questions' list."
    )


def _require_text_field(record: dict[str, Any], field_name: str, index: int) -> str:
    """Return the stripped text of a mandatory field of *record*.

    Args:
        record: Question record being checked.
        field_name: Key that must be present with non-blank content.
        index: Record position, used only to build error messages.

    Returns:
        The field value converted with ``str`` and stripped of surrounding
        whitespace.

    Raises:
        DatasetValidationError: If the key is absent, or its value is ``None``
            (JSON ``null``) or blank after stripping.
    """
    if field_name not in record:
        raise DatasetValidationError(f"Record #{index} is missing required field '{field_name}'.")

    raw_value = record[field_name]
    # JSON null decodes to None, and str(None) is the non-empty text "None",
    # which would slip past the emptiness check below; treat it as empty.
    value = "" if raw_value is None else str(raw_value).strip()
    if not value:
        raise DatasetValidationError(f"Record #{index} has empty '{field_name}'.")
    return value


def validate_question_records(records: list[dict[str, Any]]) -> None:
    """Check every question record and raise on the first problem found.

    Each record must be a dict whose "id" matches ``ID_PATTERN`` and is unique
    within *records*, and must pass ``_require_text_field`` for "question" and
    "expected_answer".

    Raises:
        DatasetValidationError: On the first record that violates a rule.
    """
    known_ids: set[str] = set()

    for position, entry in enumerate(records, start=1):
        if not isinstance(entry, dict):
            raise DatasetValidationError(f"Record #{position} must be an object.")

        entry_id = _require_text_field(entry, "id", position)
        if ID_PATTERN.match(entry_id) is None:
            raise DatasetValidationError(
                f"Record #{position} has invalid id '{entry_id}'. Expected format like q001."
            )
        if entry_id in known_ids:
            raise DatasetValidationError(f"Duplicate question id found: {entry_id}")
        known_ids.add(entry_id)

        # Remaining mandatory fields are checked in this fixed order.
        for required in ("question", "expected_answer"):
            _require_text_field(entry, required, position)


def _optional_text(record: dict[str, Any], key: str, default: str = "") -> str:
    """Return the stripped text of an optional field, or *default* if absent, null, or blank."""
    value = record.get(key)
    if value is None:
        # JSON null decodes to None; str(None) would surface as the text "None".
        return default
    return str(value).strip() or default


def load_benchmark_payload(dataset_path: Path) -> dict[str, Any]:
    """Load, validate, and normalize the benchmark dataset at *dataset_path*.

    Args:
        dataset_path: Location of the benchmark JSON file.

    Returns:
        A dict with two keys: "instruction" (``DEFAULT_SYSTEM_PROMPT``) and
        "questions", a list with one normalized dict per record. The record's
        "question" is exposed as "prompt" and its "topic" as "category"
        (falling back to "GENEL"); "expected_source" and "confidence" are
        fixed values.

    Raises:
        FileNotFoundError: If the dataset file does not exist.
        DatasetValidationError: If the payload layout or any record is invalid.
    """
    raw_payload = _load_raw_dataset(dataset_path)
    records = _extract_records(raw_payload)
    validate_question_records(records)

    questions: list[dict[str, Any]] = [
        {
            "id": str(record["id"]).strip(),
            "prompt": str(record["question"]).strip(),
            "expected_answer": str(record["expected_answer"]).strip(),
            "category": _optional_text(record, "topic", "GENEL"),
            "expected_source": "benchmark_json",
            "confidence": 1.0,
            "hardness_level": _optional_text(record, "hardness_level"),
            "why_prepared": _optional_text(record, "why_prepared"),
        }
        for record in records
    ]

    return {"instruction": DEFAULT_SYSTEM_PROMPT, "questions": questions}


def save_expected_answer(dataset_path: Path, question_id: str, expected_answer: str) -> None:
    """Replace the expected answer of one question and rewrite the dataset file.

    Args:
        dataset_path: Location of the benchmark JSON file.
        question_id: Id of the question to update; surrounding whitespace is
            ignored when matching.
        expected_answer: New answer text; stored stripped.

    Raises:
        DatasetValidationError: If *expected_answer* is blank, or the dataset
            on disk fails validation.
        FileNotFoundError: If the dataset file does not exist.
        KeyError: If no record carries *question_id*.
    """
    normalized_answer = expected_answer.strip()
    if not normalized_answer:
        raise DatasetValidationError("expected_answer cannot be empty.")

    # Stored ids are compared after str()/strip(), so normalize the lookup key
    # the same way; otherwise a padded id can never match.
    normalized_id = str(question_id).strip()

    raw_payload = _load_raw_dataset(dataset_path)
    records = _extract_records(raw_payload)
    validate_question_records(records)

    for record in records:
        if str(record.get("id", "")).strip() == normalized_id:
            # records is the list object held inside raw_payload, so this
            # in-place change is what gets written back below.
            record["expected_answer"] = normalized_answer
            break
    else:
        raise KeyError(f"Question id not found: {question_id}")

    with dataset_path.open("w", encoding="utf-8") as file:
        json.dump(raw_payload, file, ensure_ascii=False, indent=2)


def backfill_missing_ids(dataset_path: Path) -> None:
    """Assign generated ids to records that lack one and rewrite the file if needed.

    A record lacks an id when its "id" key is absent, null, or blank after
    stripping. Generated ids have the form ``q`` plus a number zero-padded to
    at least three digits, and never reuse a number already taken by an id
    matching ``ID_PATTERN``. The full dataset is validated afterwards, and the
    file is only written when at least one id was assigned.

    Args:
        dataset_path: Location of the benchmark JSON file.

    Raises:
        FileNotFoundError: If the dataset file does not exist.
        DatasetValidationError: If a record is not an object, or the dataset
            is still invalid after backfilling.
    """
    raw_payload = _load_raw_dataset(dataset_path)
    records = _extract_records(raw_payload)

    # Reject non-object records before calling .get on them, so the caller
    # sees a validation error rather than an AttributeError.
    for index, record in enumerate(records, start=1):
        if not isinstance(record, dict):
            raise DatasetValidationError(f"Record #{index} must be an object.")

    def current_id(record: dict[str, Any]) -> str:
        # JSON null decodes to None; treat it as a missing id rather than "None".
        raw = record.get("id")
        return "" if raw is None else str(raw).strip()

    existing_numbers: set[int] = {
        int(raw_id[1:]) for raw_id in map(current_id, records) if ID_PATTERN.match(raw_id)
    }

    next_number = max(existing_numbers, default=0) + 1
    changed = False

    for index, record in enumerate(records, start=1):
        if current_id(record):
            continue

        # With no numbered ids at all, the first id-less record takes its own
        # position; after that, numbering continues from the last one assigned.
        candidate_number = index if not existing_numbers else next_number
        while candidate_number in existing_numbers:
            candidate_number += 1

        record["id"] = f"q{candidate_number:03d}"
        existing_numbers.add(candidate_number)
        next_number = candidate_number + 1
        changed = True

    validate_question_records(records)

    if changed:
        with dataset_path.open("w", encoding="utf-8") as file:
            json.dump(raw_payload, file, ensure_ascii=False, indent=2)