Spaces:

XQ
/

Dokumentassistent

Running

File size: 15,838 Bytes

a493f04

"""Auto-generate an English QA evaluation test set from Danish PDF documents.

Calls an LLM (default: Qwen3-32B via Groq) to produce question / reference /
source-quote triples grounded in the source PDFs. The generated draft is
written to ``eval/qa_set_draft.yaml`` for human review and curation into the
final ``eval/qa_set.yaml``.

Each generated entry contains:
    question:           English question
    reference_en:       English reference answer (1–3 sentences)
    source_quote_da:    Verbatim Danish substring of the PDF page text
    source_doc:         PDF filename
    source_page_start:  First page of the section
    source_page_end:    Last page of the section
    category:           "fact" | "procedural" | "definition"
    quote_verified:     True if the Danish quote was found verbatim in the PDF
    reviewed:           Set to True manually after human review

Usage:
    python -m scripts.generate_qa_set [--max-sections-per-doc 3]
                                       [--questions-per-section 2]
                                       [--output eval/qa_set_draft.yaml]
                                       [--limit-docs 0]

Env vars (.env):
    LLM_PROVIDER=groq
    GROQ_API_KEY=gsk_...
    GROQ_MODEL=qwen/qwen3-32b
"""

import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any

import yaml
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage, SystemMessage

# Put the repository root (the parent of this file's directory) at the front
# of sys.path so the ``src.*`` imports below resolve; this is why those
# imports sit after executable code and carry ``noqa: E402``.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, PROJECT_ROOT)

from src.config import load_settings  # noqa: E402
from src.ingestion.pdf_parser import PDFParser  # noqa: E402
from src.ingestion.text_cleaner import TextCleaner  # noqa: E402
from src.provider import create_llm  # noqa: E402

logger = logging.getLogger(__name__)

# Directory scanned for input PDFs, and the default path of the generated
# draft (overridable with --output).
DOCS_DIR = os.path.join(PROJECT_ROOT, "docs")
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "eval", "qa_set_draft.yaml")

# Approximate character budget per section sent to the LLM. ~8K chars of
# Danish text is roughly 2.5K tokens, well within Qwen3-32B's 131K window
# even with a verbose system prompt.
_SECTION_CHAR_TARGET = 8000


# System prompt sent with every section-level generation request. A trailing
# backslash inside the literal is a line continuation, so sentences wrapped in
# the source reach the model as single lines.
# NOTE(review): rule 5 lists "a definition" as an example of the "fact"
# category while also offering a separate "definition" category; the overlap
# may make the model's tagging inconsistent — confirm the intended split.
_SYSTEM_PROMPT = """You are a question generator for a multilingual RAG \
evaluation test set.

You are given a section of a Danish university policy / regulation document. \
Your task is to generate factual question / answer pairs that test whether a \
retrieval system can find the right passage and answer the question correctly.

Strict rules:
1. Questions must be in ENGLISH (the test set targets English-speaking users \
querying Danish documents).
2. Each `reference_en` must be in ENGLISH and faithful to the source — do \
not add information not present in the text.
3. Each `source_quote_da` MUST be an EXACT verbatim substring of the source \
text I gave you. Do not paraphrase, do not translate, do not summarize. \
If you cannot find a clean verbatim quote that supports the answer, do \
not generate that question.
4. Quote length: keep `source_quote_da` between 30 and 400 characters — long \
enough to fully support the answer, short enough to be specific.
5. Categories: each question must be tagged with one of:
   - "fact": single specific fact (a number, a deadline, a definition, a rule)
   - "procedural": describes a process or sequence of steps
   - "definition": defines a term or concept
6. Avoid trivially generic questions like "What does the document say about \
X?" — questions should be answerable in 1-3 sentences with specific content.
7. Output STRICT JSON only, no markdown, no commentary, no thinking. Schema:

{
  "questions": [
    {
      "question": "string (English)",
      "reference_en": "string (English, 1-3 sentences)",
      "source_quote_da": "string (verbatim Danish substring of input)",
      "category": "fact" | "procedural" | "definition"
    }
  ]
}
"""


def _section_pages(
    pages: list[dict[str, str | int]], target_chars: int
) -> list[dict[str, Any]]:
    """Bundle consecutive pages into sections of roughly ``target_chars`` characters.

    Each page is cleaned and stripped of headers/footers first; pages that are
    blank after cleaning are skipped entirely. A section is closed as soon as
    the accumulated cleaned length reaches ``target_chars``, so a section may
    exceed the target by up to one page. Any leftover pages form a final,
    shorter section.

    Args:
        pages: Page dicts from PDFParser; the 'text' and 'page_number' keys
            are read.
        target_chars: Approximate character budget per section.

    Returns:
        Section dicts with 'text' (pages joined by a blank line),
        'page_start' and 'page_end'.
    """

    def _as_section(texts: list[str], page_nos: list[int]) -> dict[str, Any]:
        # The page span is taken from the first and last buffered page.
        return {
            "text": "\n\n".join(texts),
            "page_start": page_nos[0],
            "page_end": page_nos[-1],
        }

    cleaner = TextCleaner()
    grouped: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_pages: list[int] = []
    pending_len = 0

    for page in pages:
        body = cleaner.remove_headers_footers(cleaner.clean(str(page["text"])))
        if not body.strip():
            continue

        number = int(page["page_number"])
        pending_texts.append(body)
        pending_pages.append(number)
        pending_len += len(body)

        if pending_len < target_chars:
            continue

        # Budget reached: emit the buffered pages and start a fresh buffer.
        grouped.append(_as_section(pending_texts, pending_pages))
        pending_texts, pending_pages, pending_len = [], [], 0

    if pending_texts:
        grouped.append(_as_section(pending_texts, pending_pages))

    return grouped


# Soft-hyphen line break: a hyphen at end of line followed by a lowercase
# letter is PDF reflow artefact (e.g. "dæknings-\nområde"), not a real hyphen.
_HYPHEN_LINEBREAK_RE = re.compile(r"-\s*\n\s*(?=[a-zæøå])")

# Quote and bullet glyphs whose presence differs between PDF text extraction
# and LLM-generated quotes. Stripping them entirely makes matching robust to
# straight-vs-curly quotes and to bullets that PyMuPDF drops on extraction.
_STRIP_CHARS = (
    "\u2018\u2019\u201a\u201b"  # single curly quotes
    "\u201c\u201d\u201e\u201f"  # double curly quotes
    "\u00ab\u00bb"               # « »
    "'\""                          # straight quotes
    "\u2022\u2023\u00b7\u25aa\u25e6\u25cf\u25cb"  # bullet glyphs
)
_STRIP_TRANSLATE = str.maketrans({c: "" for c in _STRIP_CHARS})

# Word/Wingdings list bullets land in the Unicode Private Use Area (e.g.
# U+F0B7) when extracted from PDFs. Drop the whole BMP PUA range.
_PUA_RE = re.compile(r"[\ue000-\uf8ff]")


def _normalize_for_match(s: str) -> str:
    """Canonicalize text so PDF-extracted and LLM-produced strings compare equal.

    The steps run in this order:

    1. Rejoin words split by a hyphen at a line break
       (``dæknings-\\nområde`` → ``dækningsområde``).
    2. Delete Private Use Area characters (extracted Word/Wingdings bullets).
    3. Delete straight/curly quote characters and bullet glyphs, which differ
       between PDF extraction and LLM output.
    4. Collapse every whitespace run to one space and trim both ends.

    Args:
        s: Input string.

    Returns:
        Normalized string suitable for substring comparison.
    """
    dehyphenated = _HYPHEN_LINEBREAK_RE.sub("", s)
    without_pua = _PUA_RE.sub("", dehyphenated)
    without_glyphs = without_pua.translate(_STRIP_TRANSLATE)
    single_spaced = re.sub(r"\s+", " ", without_glyphs)
    return single_spaced.strip()


def _verify_quote(quote: str, source_text: str) -> bool:
    """Check that the quote occurs in source_text after tolerant normalization.

    Both strings go through ``_normalize_for_match`` before the substring
    test, so the comparison is insensitive to whatever that helper removes
    or collapses.

    Args:
        quote: Candidate Danish quote produced by the LLM.
        source_text: Full source text the quote should originate from.

    Returns:
        True if the normalized quote is non-empty and is a substring of the
        normalized source; False otherwise.
    """
    needle = _normalize_for_match(quote)
    if not needle:
        # The empty string is a substring of every string, so a quote made up
        # solely of characters the normalizer removes (e.g. just quote marks
        # or bullets) would otherwise be reported as verified.
        return False
    return needle in _normalize_for_match(source_text)


def _parse_llm_json(raw: str) -> dict[str, Any]:
    """Extract a JSON object from an LLM response, tolerating code fences and prose.

    Args:
        raw: Raw LLM output string.

    Returns:
        Parsed JSON dict.

    Raises:
        ValueError: If no valid JSON object could be extracted.
    """
    text = raw.strip()
    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fence:
        text = fence.group(1)
    else:
        start = text.find("{")
        end = text.rfind("}")
        if start >= 0 and end > start:
            text = text[start : end + 1]
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"LLM returned invalid JSON: {exc}\nRaw (first 500 chars): {raw[:500]}"
        ) from exc


def _generate_for_section(
    llm: BaseChatModel,
    source_doc: str,
    section: dict[str, Any],
    questions_per_section: int,
) -> list[dict[str, Any]]:
    """Call the LLM to generate QA pairs for one document section.

    Items that are not dicts, or whose ``question`` / ``reference_en`` /
    ``source_quote_da`` are missing, non-string or blank, are dropped.
    Items whose quote cannot be verified against the section text are KEPT
    and flagged with ``quote_verified: False`` for the human reviewer.

    Args:
        llm: LangChain BaseChatModel instance.
        source_doc: Filename of the source document.
        section: Section dict with text, page_start, page_end.
        questions_per_section: Target number of questions to generate.

    Returns:
        List of QA dicts. Empty if the LLM call fails, the response cannot
        be parsed into the expected shape, or no item passes validation.
    """
    valid_categories = ("fact", "procedural", "definition")

    user_prompt = (
        f"Generate exactly {questions_per_section} question/answer pairs from "
        f"the following Danish text. Remember: questions and reference answers "
        f"in ENGLISH, source_quote_da must be VERBATIM from the text below.\n\n"
        f"--- SOURCE TEXT ---\n{section['text']}\n--- END SOURCE TEXT ---"
    )
    messages = [
        SystemMessage(content=_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]
    try:
        response = llm.invoke(messages)
        raw = response.content if hasattr(response, "content") else str(response)
    except Exception as exc:
        # Best effort: one failed section must not abort the whole run.
        logger.error(
            "LLM call failed for %s pages %d-%d: %s",
            source_doc,
            section["page_start"],
            section["page_end"],
            exc,
        )
        return []

    if not isinstance(raw, str):
        raw = str(raw)

    try:
        parsed = _parse_llm_json(raw)
    except ValueError as exc:
        logger.warning("Failed to parse LLM JSON for %s: %s", source_doc, exc)
        return []

    if not isinstance(parsed, dict):
        # Valid JSON whose root is not an object (e.g. a bare list) has no
        # "questions" field to read.
        logger.warning("LLM JSON root is not an object for %s", source_doc)
        return []

    items = parsed.get("questions", [])
    if not isinstance(items, list):
        logger.warning("LLM 'questions' field is not a list for %s", source_doc)
        return []

    result: list[dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        question = item.get("question")
        reference_en = item.get("reference_en")
        source_quote_da = item.get("source_quote_da")
        if not (
            isinstance(question, str)
            and isinstance(reference_en, str)
            and isinstance(source_quote_da, str)
        ):
            continue
        if not (question.strip() and reference_en.strip() and source_quote_da.strip()):
            continue

        # The category comes from untrusted LLM JSON and may be a list, dict
        # or number. Testing an unhashable value against a set raises
        # TypeError, so narrow to str first; case and padding are normalized
        # so "Procedural" is not silently downgraded to the "fact" fallback.
        raw_category = item.get("category", "fact")
        normalized = raw_category.strip().lower() if isinstance(raw_category, str) else ""
        category = normalized if normalized in valid_categories else "fact"

        verified = _verify_quote(source_quote_da, section["text"])
        result.append(
            {
                "question": question.strip(),
                "reference_en": reference_en.strip(),
                "source_quote_da": source_quote_da.strip(),
                "source_doc": source_doc,
                "source_page_start": section["page_start"],
                "source_page_end": section["page_end"],
                "category": category,
                "quote_verified": verified,
                "reviewed": False,
            }
        )
    return result


def _int_at_least(minimum: int):
    """Build an argparse ``type`` callable that accepts integers >= ``minimum``."""

    def _parse(value: str) -> int:
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
        if number < minimum:
            raise argparse.ArgumentTypeError(f"must be >= {minimum}, got {number}")
        return number

    return _parse


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    ``--max-sections-per-doc`` must be >= 0: the value is used as a slice
    bound, so a negative number would silently mean "all but the last N
    sections" instead of a cap. ``--questions-per-section`` must be >= 1.
    Out-of-range values are rejected with a normal argparse usage error.

    Returns:
        Parsed argument namespace.
    """
    parser = argparse.ArgumentParser(
        description="Auto-generate an English QA test set from Danish PDFs.",
    )
    parser.add_argument(
        "--max-sections-per-doc",
        type=_int_at_least(0),
        default=3,
        help="Max sections to process per PDF (caps total questions; default 3).",
    )
    parser.add_argument(
        "--questions-per-section",
        type=_int_at_least(1),
        default=2,
        help="Number of QA pairs to request per section (default 2).",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=OUTPUT_PATH,
        help=f"Output YAML path (default: {OUTPUT_PATH}).",
    )
    parser.add_argument(
        "--limit-docs",
        type=int,
        default=0,
        # Zero or any negative value means "no limit", so plain int is kept.
        help="Process at most N PDFs (0 = all). Useful for smoke testing.",
    )
    return parser.parse_args()


def main() -> None:
    """Generate the QA draft and write it to YAML.

    Reads every ``*.pdf`` in ``DOCS_DIR`` (optionally capped by
    ``--limit-docs``), splits each into sections, asks the LLM for QA pairs
    per section and writes all candidates plus summary metadata to the
    output path. A PDF that fails to parse is logged and skipped. Exits with
    status 1 if the documents directory cannot be read.
    """
    args = parse_args()
    settings = load_settings()

    logging.basicConfig(
        level=getattr(logging, settings.log_level.upper(), logging.INFO),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    if settings.llm_provider != "groq":
        logger.warning(
            "LLM_PROVIDER is '%s', not 'groq'. The QA generator works with any "
            "provider but Qwen3-32B via Groq is recommended.",
            settings.llm_provider,
        )

    model_label = (
        settings.groq_model if settings.llm_provider == "groq" else settings.generation_model
    )
    logger.info("=== QA Draft Generation Start ===")
    logger.info("LLM provider: %s | model: %s", settings.llm_provider, model_label)

    llm = create_llm(settings)
    parser = PDFParser()

    try:
        dir_entries = os.listdir(DOCS_DIR)
    except OSError as exc:
        # A missing or unreadable docs directory is a setup problem; report
        # it as one actionable line instead of a raw traceback.
        logger.error("Cannot read documents directory %s: %s", DOCS_DIR, exc)
        sys.exit(1)

    pdf_files = sorted(f for f in dir_entries if f.lower().endswith(".pdf"))
    if args.limit_docs > 0:
        pdf_files = pdf_files[: args.limit_docs]
    logger.info("Found %d PDFs in %s", len(pdf_files), DOCS_DIR)
    if not pdf_files:
        logger.warning("No PDF files to process; the draft will contain no questions.")

    all_questions: list[dict[str, Any]] = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(DOCS_DIR, pdf_file)
        logger.info("Processing %s ...", pdf_file)
        try:
            pages = parser.parse(pdf_path)
        except Exception as exc:
            # Best effort: one unreadable PDF must not abort the whole run.
            logger.error("Failed to parse %s: %s", pdf_file, exc)
            continue

        sections = _section_pages(pages, _SECTION_CHAR_TARGET)
        sections = sections[: args.max_sections_per_doc]
        logger.info("  -> %d sections", len(sections))

        for i, section in enumerate(sections, start=1):
            logger.info(
                "  Section %d/%d (pages %d-%d)",
                i,
                len(sections),
                section["page_start"],
                section["page_end"],
            )
            qa_items = _generate_for_section(
                llm=llm,
                source_doc=pdf_file,
                section=section,
                questions_per_section=args.questions_per_section,
            )
            verified = sum(1 for q in qa_items if q["quote_verified"])
            logger.info("    -> %d questions (%d verified)", len(qa_items), verified)
            all_questions.extend(qa_items)

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Computed once and reused for both the YAML metadata and the summary.
    verified_total = sum(1 for q in all_questions if q["quote_verified"])

    payload = {
        "meta": {
            "generator": "scripts/generate_qa_set.py",
            "llm_provider": settings.llm_provider,
            "llm_model": model_label,
            "total_candidates": len(all_questions),
            "verified_quotes": verified_total,
        },
        "questions": all_questions,
    }

    with open(out_path, "w", encoding="utf-8") as fh:
        yaml.safe_dump(
            payload,
            fh,
            allow_unicode=True,  # keep Danish characters readable, not escaped
            sort_keys=False,  # preserve the field order built above
            default_flow_style=False,
            width=100,
        )

    logger.info("=== QA Draft Generation Complete ===")
    print(f"\nDraft written to: {out_path}")
    print(f"Total questions:  {len(all_questions)}")
    print(f"Verified quotes:  {verified_total} / {len(all_questions)}")
    print("\nNext steps:")
    print("  1. Open the YAML and review each entry.")
    print("  2. Set `reviewed: true` on entries you want to keep.")
    print("  3. Edit any field that needs fixing.")
    print("  4. Save the curated set as eval/qa_set.yaml when done.")


if __name__ == "__main__":
    main()