Dokumentassistent / scripts /generate_qa_set.py
XQ
Add evaluation and update README
a493f04
raw
history blame
15.8 kB
"""Auto-generate an English QA evaluation test set from Danish PDF documents.
Calls an LLM (default: Qwen3-32B via Groq) to produce question / reference /
source-quote triples grounded in the source PDFs. The generated draft is
written to ``eval/qa_set_draft.yaml`` for human review and curation into the
final ``eval/qa_set.yaml``.
Each generated entry contains:
question: English question
reference_en: English reference answer (1–3 sentences)
source_quote_da: Verbatim Danish substring of the PDF page text
source_doc: PDF filename
source_page_start: First page of the section
source_page_end: Last page of the section
category: "fact" | "procedural" | "definition"
quote_verified: True if the Danish quote was found verbatim in the PDF
reviewed: Set to True manually after human review
Usage:
python -m scripts.generate_qa_set [--max-sections-per-doc 3]
[--questions-per-section 2]
[--output eval/qa_set_draft.yaml]
[--limit-docs 0]
Env vars (.env):
LLM_PROVIDER=groq
GROQ_API_KEY=gsk_...
GROQ_MODEL=qwen/qwen3-32b
"""
import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any
import yaml
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage, SystemMessage
# Resolve the repository root (the parent of this script's directory) and put
# it first on sys.path so that the ``src.*`` imports that follow can be
# resolved. This must run before those imports, hence their E402 waivers.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, PROJECT_ROOT)
from src.config import load_settings # noqa: E402
from src.ingestion.pdf_parser import PDFParser # noqa: E402
from src.ingestion.text_cleaner import TextCleaner # noqa: E402
from src.provider import create_llm # noqa: E402
# Module-level logger; level and format are configured in main().
logger = logging.getLogger(__name__)
# Directory that main() scans for input PDF files.
DOCS_DIR = os.path.join(PROJECT_ROOT, "docs")
# Default destination of the generated draft (can be overridden with --output).
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "eval", "qa_set_draft.yaml")
# Approximate character budget per section sent to the LLM. ~8K chars of
# Danish text is roughly 2.5K tokens, well within Qwen3-32B's 131K window
# even with a verbose system prompt.
_SECTION_CHAR_TARGET = 8000
# System prompt sent as the SystemMessage of every section request. The JSON
# schema it describes (a top-level "questions" list whose items carry
# "question", "reference_en", "source_quote_da" and "category") is exactly
# what _generate_for_section() reads back from the model's reply.
# NOTE: this is runtime text; any edit changes what the model is asked to do.
_SYSTEM_PROMPT = """You are a question generator for a multilingual RAG \
evaluation test set.
You are given a section of a Danish university policy / regulation document. \
Your task is to generate factual question / answer pairs that test whether a \
retrieval system can find the right passage and answer the question correctly.
Strict rules:
1. Questions must be in ENGLISH (the test set targets English-speaking users \
querying Danish documents).
2. Each `reference_en` must be in ENGLISH and faithful to the source — do \
not add information not present in the text.
3. Each `source_quote_da` MUST be an EXACT verbatim substring of the source \
text I gave you. Do not paraphrase, do not translate, do not summarize. \
If you cannot find a clean verbatim quote that supports the answer, do \
not generate that question.
4. Quote length: keep `source_quote_da` between 30 and 400 characters — long \
enough to fully support the answer, short enough to be specific.
5. Categories: each question must be tagged with one of:
- "fact": single specific fact (a number, a deadline, a definition, a rule)
- "procedural": describes a process or sequence of steps
- "definition": defines a term or concept
6. Avoid trivially generic questions like "What does the document say about \
X?" — questions should be answerable in 1-3 sentences with specific content.
7. Output STRICT JSON only, no markdown, no commentary, no thinking. Schema:
{
"questions": [
{
"question": "string (English)",
"reference_en": "string (English, 1-3 sentences)",
"source_quote_da": "string (verbatim Danish substring of input)",
"category": "fact" | "procedural" | "definition"
}
]
}
"""
def _section_pages(
    pages: list[dict[str, str | int]], target_chars: int
) -> list[dict[str, Any]]:
    """Bundle consecutive pages into sections of roughly ``target_chars``.

    Each page is run through the text cleaner (general cleaning, then
    header/footer removal). Pages that are blank afterwards are skipped.
    Remaining pages accumulate until the running character count (page texts
    only, separators not counted) reaches the budget; the accumulated pages
    are then emitted as one section. Pages left over at the end form a final,
    possibly shorter, section.

    Args:
        pages: List of page dicts from PDFParser; the ``"text"`` and
            ``"page_number"`` keys are read.
        target_chars: Approximate character budget per section.

    Returns:
        List of section dicts with 'text', 'page_start', 'page_end'.
    """

    def _as_section(texts: list[str], numbers: list[int]) -> dict[str, Any]:
        # Pages inside one section are separated by a blank line.
        return {
            "text": "\n\n".join(texts),
            "page_start": numbers[0],
            "page_end": numbers[-1],
        }

    text_cleaner = TextCleaner()
    grouped: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_pages: list[int] = []
    pending_chars = 0

    for page in pages:
        page_text = text_cleaner.remove_headers_footers(
            text_cleaner.clean(str(page["text"]))
        )
        if not page_text.strip():
            continue

        pending_texts.append(page_text)
        pending_pages.append(int(page["page_number"]))
        pending_chars += len(page_text)

        if pending_chars < target_chars:
            continue

        # Budget reached: close the current section and start a fresh one.
        grouped.append(_as_section(pending_texts, pending_pages))
        pending_texts, pending_pages, pending_chars = [], [], 0

    if pending_texts:
        grouped.append(_as_section(pending_texts, pending_pages))
    return grouped
# Soft-hyphen line break: a hyphen at end of line followed by a lowercase
# letter is PDF reflow artefact (e.g. "dæknings-\nområde"), not a real hyphen.
_HYPHEN_LINEBREAK_RE = re.compile(r"-\s*\n\s*(?=[a-zæøå])")
# Quote and bullet glyphs whose presence differs between PDF text extraction
# and LLM-generated quotes. Stripping them entirely makes matching robust to
# straight-vs-curly quotes and to bullets that PyMuPDF drops on extraction.
_STRIP_CHARS = (
"\u2018\u2019\u201a\u201b" # single curly quotes
"\u201c\u201d\u201e\u201f" # double curly quotes
"\u00ab\u00bb" # « »
"'\"" # straight quotes
"\u2022\u2023\u00b7\u25aa\u25e6\u25cf\u25cb" # bullet glyphs
)
_STRIP_TRANSLATE = str.maketrans({c: "" for c in _STRIP_CHARS})
# Word/Wingdings list bullets land in the Unicode Private Use Area (e.g.
# U+F0B7) when extracted from PDFs. Drop the whole BMP PUA range.
_PUA_RE = re.compile(r"[\ue000-\uf8ff]")
def _normalize_for_match(s: str) -> str:
"""Normalize text for tolerant verbatim-quote matching.
Heals PDF-style soft hyphens at line breaks (e.g. ``dæknings-\\nområde``
→ ``dækningsområde``), removes quote characters whose straight/curly
variants differ between PDF extraction and LLM output, drops bullet
glyphs that PDF extraction tends to discard, and collapses whitespace
runs to a single space.
Args:
s: Input string.
Returns:
Normalized string suitable for substring comparison.
"""
s = _HYPHEN_LINEBREAK_RE.sub("", s)
s = _PUA_RE.sub("", s)
s = s.translate(_STRIP_TRANSLATE)
return re.sub(r"\s+", " ", s).strip()
def _verify_quote(quote: str, source_text: str) -> bool:
"""Verify the quote is a verbatim substring of source_text after whitespace normalization.
Args:
quote: Candidate Danish quote produced by the LLM.
source_text: Full source text the quote should originate from.
Returns:
True if the quote is found verbatim (modulo whitespace) in the source.
"""
return _normalize_for_match(quote) in _normalize_for_match(source_text)
def _parse_llm_json(raw: str) -> dict[str, Any]:
"""Extract a JSON object from an LLM response, tolerating code fences and prose.
Args:
raw: Raw LLM output string.
Returns:
Parsed JSON dict.
Raises:
ValueError: If no valid JSON object could be extracted.
"""
text = raw.strip()
fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
if fence:
text = fence.group(1)
else:
start = text.find("{")
end = text.rfind("}")
if start >= 0 and end > start:
text = text[start : end + 1]
try:
return json.loads(text)
except json.JSONDecodeError as exc:
raise ValueError(
f"LLM returned invalid JSON: {exc}\nRaw (first 500 chars): {raw[:500]}"
) from exc
def _generate_for_section(
    llm: BaseChatModel,
    source_doc: str,
    section: dict[str, Any],
    questions_per_section: int,
) -> list[dict[str, Any]]:
    """Call the LLM to generate QA pairs for one document section.

    The section text is sent together with the module's system prompt. The
    reply is parsed as JSON and every entry of its ``questions`` list is kept
    only if question, reference answer and quote are non-empty strings. Kept
    entries are annotated with the source location and with the result of
    matching the quote against the section text. Failures (LLM error,
    unparsable or mis-shaped JSON) are logged and produce an empty list
    instead of raising.

    Args:
        llm: LangChain BaseChatModel instance.
        source_doc: Filename of the source document.
        section: Section dict with text, page_start, page_end.
        questions_per_section: Number of questions requested in the prompt
            (not enforced on the reply).

    Returns:
        List of QA dicts; empty if the LLM call fails, the reply cannot be
        parsed, or no entry passes validation. Entries whose quote cannot be
        matched are still returned, flagged with ``quote_verified=False``.
    """
    valid_categories = frozenset({"fact", "procedural", "definition"})

    user_prompt = (
        f"Generate exactly {questions_per_section} question/answer pairs from "
        f"the following Danish text. Remember: questions and reference answers "
        f"in ENGLISH, source_quote_da must be VERBATIM from the text below.\n\n"
        f"--- SOURCE TEXT ---\n{section['text']}\n--- END SOURCE TEXT ---"
    )
    messages = [
        SystemMessage(content=_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]
    # Deliberately broad: any provider/network failure should skip this
    # section rather than abort the whole run.
    try:
        response = llm.invoke(messages)
        raw = response.content if hasattr(response, "content") else str(response)
    except Exception as exc:
        logger.error(
            "LLM call failed for %s pages %d-%d: %s",
            source_doc,
            section["page_start"],
            section["page_end"],
            exc,
        )
        return []
    if not isinstance(raw, str):
        raw = str(raw)

    try:
        parsed = _parse_llm_json(raw)
    except ValueError as exc:
        logger.warning("Failed to parse LLM JSON for %s: %s", source_doc, exc)
        return []
    # Guard against a JSON root that is not an object (e.g. a bare list);
    # calling .get() on it would raise AttributeError.
    if not isinstance(parsed, dict):
        logger.warning("LLM JSON root is not an object for %s", source_doc)
        return []

    items = parsed.get("questions", [])
    if not isinstance(items, list):
        logger.warning("LLM 'questions' field is not a list for %s", source_doc)
        return []

    result: list[dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        question = item.get("question")
        reference_en = item.get("reference_en")
        source_quote_da = item.get("source_quote_da")
        if not (
            isinstance(question, str)
            and isinstance(reference_en, str)
            and isinstance(source_quote_da, str)
        ):
            continue
        if not (question.strip() and reference_en.strip() and source_quote_da.strip()):
            continue

        # The model may return a non-string category (e.g. a list); testing an
        # unhashable value for set membership raises TypeError, so check the
        # type first and fall back to "fact" for anything unexpected.
        category = item.get("category", "fact")
        if not (isinstance(category, str) and category in valid_categories):
            category = "fact"

        result.append(
            {
                "question": question.strip(),
                "reference_en": reference_en.strip(),
                "source_quote_da": source_quote_da.strip(),
                "source_doc": source_doc,
                "source_page_start": section["page_start"],
                "source_page_end": section["page_end"],
                "category": category,
                "quote_verified": _verify_quote(source_quote_da, section["text"]),
                "reviewed": False,
            }
        )
    return result
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Options (all optional): ``--max-sections-per-doc``,
    ``--questions-per-section``, ``--output`` and ``--limit-docs``.

    Returns:
        Parsed argument namespace.
    """
    cli = argparse.ArgumentParser(
        description="Auto-generate an English QA test set from Danish PDFs.",
    )
    # Declared as data so every option is registered the same way; the order
    # here is the order shown in --help.
    option_specs = [
        (
            "--max-sections-per-doc",
            {
                "type": int,
                "default": 3,
                "help": "Max sections to process per PDF (caps total questions; default 3).",
            },
        ),
        (
            "--questions-per-section",
            {
                "type": int,
                "default": 2,
                "help": "Number of QA pairs to request per section (default 2).",
            },
        ),
        (
            "--output",
            {
                "type": str,
                "default": OUTPUT_PATH,
                "help": f"Output YAML path (default: {OUTPUT_PATH}).",
            },
        ),
        (
            "--limit-docs",
            {
                "type": int,
                "default": 0,
                "help": "Process at most N PDFs (0 = all). Useful for smoke testing.",
            },
        ),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
def main() -> None:
    """Generate the QA draft and write it to YAML.

    Steps: parse CLI arguments and settings, configure logging, create the
    LLM, then for every PDF in ``DOCS_DIR`` (optionally limited by
    ``--limit-docs``) split it into sections, keep at most
    ``--max-sections-per-doc`` of them and request QA pairs per section.
    All candidates are written, with a small ``meta`` header, to the output
    YAML file, and a summary plus review instructions are printed.
    """
    args = parse_args()
    settings = load_settings()

    log_level = getattr(logging, settings.log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    uses_groq = settings.llm_provider == "groq"
    if not uses_groq:
        logger.warning(
            "LLM_PROVIDER is '%s', not 'groq'. The QA generator works with any "
            "provider but Qwen3-32B via Groq is recommended.",
            settings.llm_provider,
        )
    # Label used for logging and the YAML header only.
    model_label = settings.groq_model if uses_groq else settings.generation_model

    logger.info("=== QA Draft Generation Start ===")
    logger.info("LLM provider: %s | model: %s", settings.llm_provider, model_label)

    llm = create_llm(settings)
    pdf_parser = PDFParser()

    pdf_files = sorted(
        name for name in os.listdir(DOCS_DIR) if name.lower().endswith(".pdf")
    )
    if args.limit_docs > 0:
        pdf_files = pdf_files[: args.limit_docs]
    logger.info("Found %d PDFs in %s", len(pdf_files), DOCS_DIR)

    collected: list[dict[str, Any]] = []
    for pdf_file in pdf_files:
        logger.info("Processing %s ...", pdf_file)
        try:
            pages = pdf_parser.parse(os.path.join(DOCS_DIR, pdf_file))
        except Exception as exc:
            # A broken PDF should not stop the remaining documents.
            logger.error("Failed to parse %s: %s", pdf_file, exc)
            continue

        sections = _section_pages(pages, _SECTION_CHAR_TARGET)[: args.max_sections_per_doc]
        section_count = len(sections)
        logger.info(" -> %d sections", section_count)

        for index, section in enumerate(sections, start=1):
            logger.info(
                " Section %d/%d (pages %d-%d)",
                index,
                section_count,
                section["page_start"],
                section["page_end"],
            )
            qa_items = _generate_for_section(
                llm=llm,
                source_doc=pdf_file,
                section=section,
                questions_per_section=args.questions_per_section,
            )
            section_verified = sum(1 for entry in qa_items if entry["quote_verified"])
            logger.info(" -> %d questions (%d verified)", len(qa_items), section_verified)
            collected.extend(qa_items)

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    total = len(collected)
    verified_total = sum(1 for entry in collected if entry["quote_verified"])
    payload = {
        "meta": {
            "generator": "scripts/generate_qa_set.py",
            "llm_provider": settings.llm_provider,
            "llm_model": model_label,
            "total_candidates": total,
            "verified_quotes": verified_total,
        },
        "questions": collected,
    }
    with out_path.open("w", encoding="utf-8") as fh:
        yaml.safe_dump(
            payload,
            fh,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100,
        )

    logger.info("=== QA Draft Generation Complete ===")
    print(f"\nDraft written to: {out_path}")
    print(f"Total questions: {total}")
    print(f"Verified quotes: {verified_total} / {total}")
    print("\nNext steps:")
    for step in (
        " 1. Open the YAML and review each entry.",
        " 2. Set `reviewed: true` on entries you want to keep.",
        " 3. Edit any field that needs fixing.",
        " 4. Save the curated set as eval/qa_set.yaml when done.",
    ):
        print(step)


if __name__ == "__main__":
    main()