Spaces:

XQ
/

Dokumentassistent

Running

Dokumentassistent / scripts /generate_qa_set.py

Add evaluation and update README

a493f04 about 1 month ago

15.8 kB

	"""Auto-generate an English QA evaluation test set from Danish PDF documents.

	Calls an LLM (default: Qwen3-32B via Groq) to produce question / reference /
	source-quote triples grounded in the source PDFs. The generated draft is
	written to ``eval/qa_set_draft.yaml`` for human review and curation into the
	final ``eval/qa_set.yaml``.

	Each generated entry contains:
	question: English question
	reference_en: English reference answer (1–3 sentences)
	source_quote_da: Verbatim Danish substring of the PDF page text
	source_doc: PDF filename
	source_page_start: First page of the section
	source_page_end: Last page of the section
	category: "fact" | "procedural" | "definition"
	quote_verified: True if the Danish quote was found verbatim in the PDF
	reviewed: Set to True manually after human review

	Usage:
	python -m scripts.generate_qa_set [--max-sections-per-doc 3]
	[--questions-per-section 2]

	Env vars (.env):
	LLM_PROVIDER=groq
	GROQ_API_KEY=gsk_...
	GROQ_MODEL=qwen/qwen3-32b
	"""

	import argparse
	import json
	import logging
	import os
	import re
	import sys
	from pathlib import Path
	from typing import Any

	import yaml
	from langchain_core.language_models.chat_models import BaseChatModel
	from langchain_core.messages import HumanMessage, SystemMessage

	# Repository root: the parent of the directory that contains this script.
	PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
	# Put the root at the front of sys.path ahead of the ``src.*`` imports
	# below; they come after this statement, hence the E402 suppressions.
	sys.path.insert(0, PROJECT_ROOT)

	from src.config import load_settings # noqa: E402
	from src.ingestion.pdf_parser import PDFParser # noqa: E402
	from src.ingestion.text_cleaner import TextCleaner # noqa: E402
	from src.provider import create_llm # noqa: E402

	logger = logging.getLogger(__name__)

	# Directory that main() scans for input PDFs.
	DOCS_DIR = os.path.join(PROJECT_ROOT, "docs")
	# Default destination of the generated draft (overridable with --output).
	OUTPUT_PATH = os.path.join(PROJECT_ROOT, "eval", "qa_set_draft.yaml")

	# Approximate character budget per section sent to the LLM. ~8K chars of
	# Danish text is roughly 2.5K tokens, well within Qwen3-32B's 131K window
	# even with a verbose system prompt.
	_SECTION_CHAR_TARGET = 8000


	_SYSTEM_PROMPT = """You are a question generator for a multilingual RAG \
	evaluation test set.

	You are given a section of a Danish university policy / regulation document. \
	Your task is to generate factual question / answer pairs that test whether a \
	retrieval system can find the right passage and answer the question correctly.

	Strict rules:
	1. Questions must be in ENGLISH (the test set targets English-speaking users \
	querying Danish documents).
	2. Each `reference_en` must be in ENGLISH and faithful to the source — do \
	not add information not present in the text.
	3. Each `source_quote_da` MUST be an EXACT verbatim substring of the source \
	text I gave you. Do not paraphrase, do not translate, do not summarize. \
	If you cannot find a clean verbatim quote that supports the answer, do \
	not generate that question.
	4. Quote length: keep `source_quote_da` between 30 and 400 characters — long \
	enough to fully support the answer, short enough to be specific.
	5. Categories: each question must be tagged with one of:
	- "fact": single specific fact (a number, a deadline, a definition, a rule)
	- "procedural": describes a process or sequence of steps
	- "definition": defines a term or concept
	6. Avoid trivially generic questions like "What does the document say about \
	X?" — questions should be answerable in 1-3 sentences with specific content.
	7. Output STRICT JSON only, no markdown, no commentary, no thinking. Schema:

	{
	"questions": [
	{
	"question": "string (English)",
	"reference_en": "string (English, 1-3 sentences)",
	"source_quote_da": "string (verbatim Danish substring of input)",
	"category": "fact" \| "procedural" \| "definition"
	}
	]
	}
	"""


	def _section_pages(
	    pages: list[dict[str, str | int]], target_chars: int
	) -> list[dict[str, Any]]:
	    """Group consecutive pages into sections of approximately target_chars.

	    Each page is cleaned and stripped of headers/footers first; pages that
	    are blank after cleaning are skipped and do not contribute to a section's
	    page range. A section is closed as soon as its accumulated character
	    count reaches ``target_chars``, so a section may overshoot the target by
	    up to one page. Whatever remains after the last page forms a final,
	    possibly shorter, section.

	    Args:
	        pages: List of page dicts from PDFParser; the ``"text"`` and
	            ``"page_number"`` keys are read.
	        target_chars: Approximate character budget per section.

	    Returns:
	        List of section dicts with 'text', 'page_start', 'page_end'.
	    """
	    cleaner = TextCleaner()
	    sections: list[dict[str, Any]] = []
	    buf: list[str] = []
	    buf_pages: list[int] = []
	    buf_chars = 0

	    for page in pages:
	        cleaned = cleaner.clean(str(page["text"]))
	        cleaned = cleaner.remove_headers_footers(cleaned)
	        if not cleaned.strip():
	            continue
	        buf.append(cleaned)
	        buf_pages.append(int(page["page_number"]))
	        # The "\n\n" separators are not counted: the budget is approximate.
	        buf_chars += len(cleaned)
	        if buf_chars >= target_chars:
	            sections.append(_build_section(buf, buf_pages))
	            buf, buf_pages, buf_chars = [], [], 0

	    # Flush the trailing pages that never reached the budget.
	    if buf:
	        sections.append(_build_section(buf, buf_pages))

	    return sections


	def _build_section(texts: list[str], page_numbers: list[int]) -> dict[str, Any]:
	    """Assemble one section dict from buffered page texts and page numbers."""
	    return {
	        "text": "\n\n".join(texts),
	        "page_start": page_numbers[0],
	        "page_end": page_numbers[-1],
	    }


	# Soft-hyphen line break: a hyphen at end of line followed by a lowercase
	# letter is PDF reflow artefact (e.g. "dæknings-\nområde"), not a real hyphen.
	_HYPHEN_LINEBREAK_RE = re.compile(r"-\s\n\s(?=[a-zæøå])")

	# Quote and bullet glyphs whose presence differs between PDF text extraction
	# and LLM-generated quotes. Stripping them entirely makes matching robust to
	# straight-vs-curly quotes and to bullets that PyMuPDF drops on extraction.
	_STRIP_CHARS = (
	"\u2018\u2019\u201a\u201b" # single curly quotes
	"\u201c\u201d\u201e\u201f" # double curly quotes
	"\u00ab\u00bb" # « »
	"'\"" # straight quotes
	"\u2022\u2023\u00b7\u25aa\u25e6\u25cf\u25cb" # bullet glyphs
	)
	_STRIP_TRANSLATE = str.maketrans({c: "" for c in _STRIP_CHARS})

	# Word/Wingdings list bullets land in the Unicode Private Use Area (e.g.
	# U+F0B7) when extracted from PDFs. Drop the whole BMP PUA range.
	_PUA_RE = re.compile(r"[\ue000-\uf8ff]")


	def _normalize_for_match(s: str) -> str:
	    """Return a canonical form of ``s`` for tolerant substring comparison.

	    The steps run in this order:

	    1. Line-break hyphenation matched by ``_HYPHEN_LINEBREAK_RE`` is removed,
	       joining the two halves of a word split across lines.
	    2. Private Use Area code points (``_PUA_RE``) are deleted.
	    3. Quote and bullet glyphs listed in ``_STRIP_CHARS`` are deleted.
	    4. Every run of whitespace becomes one space, and leading/trailing
	       whitespace is trimmed.

	    Args:
	        s: Input string.

	    Returns:
	        Normalized string suitable for substring comparison.
	    """
	    dehyphenated = _HYPHEN_LINEBREAK_RE.sub("", s)
	    without_pua = _PUA_RE.sub("", dehyphenated)
	    without_glyphs = without_pua.translate(_STRIP_TRANSLATE)
	    single_spaced = re.sub(r"\s+", " ", without_glyphs)
	    return single_spaced.strip()


	def _verify_quote(quote: str, source_text: str) -> bool:
	    """Check that the quote occurs in source_text after tolerant normalization.

	    Both strings go through ``_normalize_for_match`` before the substring
	    test, so the match is verbatim only up to what that function changes.

	    Args:
	        quote: Candidate Danish quote produced by the LLM.
	        source_text: Full source text the quote should originate from.

	    Returns:
	        True if the normalized quote is non-empty and is a substring of the
	        normalized source.
	    """
	    needle = _normalize_for_match(quote)
	    if not needle:
	        # The empty string is a substring of everything; a quote that
	        # normalizes away completely must not count as verified.
	        return False
	    return needle in _normalize_for_match(source_text)


	def _parse_llm_json(raw: str) -> dict[str, Any]:
	"""Extract a JSON object from an LLM response, tolerating code fences and prose.

	Args:
	raw: Raw LLM output string.

	Returns:
	Parsed JSON dict.

	Raises:
	ValueError: If no valid JSON object could be extracted.
	"""
	text = raw.strip()
	fence = re.search(r"```(?:json)?\s(\{.?\})\s*```", text, re.DOTALL)
	if fence:
	text = fence.group(1)
	else:
	start = text.find("{")
	end = text.rfind("}")
	if start >= 0 and end > start:
	text = text[start : end + 1]
	try:
	return json.loads(text)
	except json.JSONDecodeError as exc:
	raise ValueError(
	f"LLM returned invalid JSON: {exc}\nRaw (first 500 chars): {raw[:500]}"
	) from exc


	def _generate_for_section(
	    llm: BaseChatModel,
	    source_doc: str,
	    section: dict[str, Any],
	    questions_per_section: int,
	) -> list[dict[str, Any]]:
	    """Call the LLM to generate QA pairs for one document section.

	    Every well-formed item in the response is kept. Items whose quote cannot
	    be located in the section text are NOT dropped; they are returned with
	    ``quote_verified`` set to False so a reviewer can decide. Failures are
	    logged and turned into an empty result instead of being raised.

	    Args:
	        llm: LangChain BaseChatModel instance.
	        source_doc: Filename of the source document.
	        section: Section dict with text, page_start, page_end.
	        questions_per_section: Target number of questions to generate.

	    Returns:
	        List of QA dicts. Empty if the LLM call fails, the response is not a
	        JSON object with a ``questions`` list, or no item carries non-empty
	        string values for question, reference_en and source_quote_da.
	    """
	    # A tuple, not a set: membership must not fail on unhashable values.
	    valid_categories = ("fact", "procedural", "definition")

	    user_prompt = (
	        f"Generate exactly {questions_per_section} question/answer pairs from "
	        f"the following Danish text. Remember: questions and reference answers "
	        f"in ENGLISH, source_quote_da must be VERBATIM from the text below.\n\n"
	        f"--- SOURCE TEXT ---\n{section['text']}\n--- END SOURCE TEXT ---"
	    )
	    messages = [
	        SystemMessage(content=_SYSTEM_PROMPT),
	        HumanMessage(content=user_prompt),
	    ]
	    try:
	        response = llm.invoke(messages)
	        raw = response.content if hasattr(response, "content") else str(response)
	    except Exception as exc:
	        # Best effort: one failed section must not abort the whole run.
	        logger.error(
	            "LLM call failed for %s pages %d-%d: %s",
	            source_doc,
	            section["page_start"],
	            section["page_end"],
	            exc,
	        )
	        return []

	    if not isinstance(raw, str):
	        raw = str(raw)

	    try:
	        parsed = _parse_llm_json(raw)
	    except ValueError as exc:
	        logger.warning("Failed to parse LLM JSON for %s: %s", source_doc, exc)
	        return []

	    if not isinstance(parsed, dict):
	        logger.warning("LLM JSON root is not an object for %s", source_doc)
	        return []

	    items = parsed.get("questions", [])
	    if not isinstance(items, list):
	        logger.warning("LLM 'questions' field is not a list for %s", source_doc)
	        return []

	    result: list[dict[str, Any]] = []
	    for item in items:
	        if not isinstance(item, dict):
	            continue
	        question = item.get("question")
	        reference_en = item.get("reference_en")
	        source_quote_da = item.get("source_quote_da")
	        if not (
	            isinstance(question, str)
	            and isinstance(reference_en, str)
	            and isinstance(source_quote_da, str)
	        ):
	            continue
	        if not (question.strip() and reference_en.strip() and source_quote_da.strip()):
	            continue

	        # Missing, non-string or unknown categories all fall back to "fact".
	        category = item.get("category", "fact")
	        if not (isinstance(category, str) and category in valid_categories):
	            category = "fact"

	        result.append(
	            {
	                "question": question.strip(),
	                "reference_en": reference_en.strip(),
	                "source_quote_da": source_quote_da.strip(),
	                "source_doc": source_doc,
	                "source_page_start": section["page_start"],
	                "source_page_end": section["page_end"],
	                "category": category,
	                "quote_verified": _verify_quote(source_quote_da, section["text"]),
	                "reviewed": False,
	            }
	        )
	    return result


	def parse_args() -> argparse.Namespace:
	    """Build the command-line interface and parse the process arguments.

	    Options (all optional): ``--max-sections-per-doc``,
	    ``--questions-per-section``, ``--output`` and ``--limit-docs``.

	    Returns:
	        Parsed argument namespace.
	    """
	    cli = argparse.ArgumentParser(
	        description="Auto-generate an English QA test set from Danish PDFs.",
	    )
	    # (flag, value type, default, help text) -- registered in this order.
	    option_specs = (
	        (
	            "--max-sections-per-doc",
	            int,
	            3,
	            "Max sections to process per PDF (caps total questions; default 3).",
	        ),
	        (
	            "--questions-per-section",
	            int,
	            2,
	            "Number of QA pairs to request per section (default 2).",
	        ),
	        (
	            "--output",
	            str,
	            OUTPUT_PATH,
	            f"Output YAML path (default: {OUTPUT_PATH}).",
	        ),
	        (
	            "--limit-docs",
	            int,
	            0,
	            "Process at most N PDFs (0 = all). Useful for smoke testing.",
	        ),
	    )
	    for flag, value_type, default_value, help_text in option_specs:
	        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
	    return cli.parse_args()


	def main() -> None:
	    """Generate the QA draft and write it to YAML.

	    Reads the CLI arguments and project settings, parses every PDF in
	    ``DOCS_DIR`` (optionally capped by ``--limit-docs``), splits each into
	    sections, asks the LLM for QA pairs per section, and dumps all candidates
	    plus a small ``meta`` header to the output path. A PDF that fails to
	    parse is logged and skipped.
	    """
	    args = parse_args()
	    settings = load_settings()

	    logging.basicConfig(
	        level=getattr(logging, settings.log_level.upper(), logging.INFO),
	        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	    )

	    if settings.llm_provider != "groq":
	        logger.warning(
	            "LLM_PROVIDER is '%s', not 'groq'. The QA generator works with any "
	            "provider but Qwen3-32B via Groq is recommended.",
	            settings.llm_provider,
	        )

	    model_label = (
	        settings.groq_model if settings.llm_provider == "groq" else settings.generation_model
	    )
	    logger.info("=== QA Draft Generation Start ===")
	    # Plain "|" separator: "\|" is an invalid escape and would log a backslash.
	    logger.info("LLM provider: %s | model: %s", settings.llm_provider, model_label)

	    llm = create_llm(settings)
	    parser = PDFParser()

	    # Sorted so the processing order (and --limit-docs selection) is stable.
	    pdf_files = sorted(f for f in os.listdir(DOCS_DIR) if f.lower().endswith(".pdf"))
	    if args.limit_docs > 0:
	        pdf_files = pdf_files[: args.limit_docs]
	    logger.info("Found %d PDFs in %s", len(pdf_files), DOCS_DIR)

	    all_questions: list[dict[str, Any]] = []
	    for pdf_file in pdf_files:
	        pdf_path = os.path.join(DOCS_DIR, pdf_file)
	        logger.info("Processing %s ...", pdf_file)
	        try:
	            pages = parser.parse(pdf_path)
	        except Exception as exc:
	            # Best effort: skip unreadable PDFs and carry on with the rest.
	            logger.error("Failed to parse %s: %s", pdf_file, exc)
	            continue

	        sections = _section_pages(pages, _SECTION_CHAR_TARGET)
	        sections = sections[: args.max_sections_per_doc]
	        logger.info(" -> %d sections", len(sections))

	        for i, section in enumerate(sections, start=1):
	            logger.info(
	                " Section %d/%d (pages %d-%d)",
	                i,
	                len(sections),
	                section["page_start"],
	                section["page_end"],
	            )
	            qa_items = _generate_for_section(
	                llm=llm,
	                source_doc=pdf_file,
	                section=section,
	                questions_per_section=args.questions_per_section,
	            )
	            verified = sum(1 for q in qa_items if q["quote_verified"])
	            logger.info(" -> %d questions (%d verified)", len(qa_items), verified)
	            all_questions.extend(qa_items)

	    out_path = Path(args.output)
	    out_path.parent.mkdir(parents=True, exist_ok=True)

	    # Computed once; used in both the YAML header and the console summary.
	    verified_total = sum(1 for q in all_questions if q["quote_verified"])

	    payload = {
	        "meta": {
	            "generator": "scripts/generate_qa_set.py",
	            "llm_provider": settings.llm_provider,
	            "llm_model": model_label,
	            "total_candidates": len(all_questions),
	            "verified_quotes": verified_total,
	        },
	        "questions": all_questions,
	    }

	    with open(out_path, "w", encoding="utf-8") as fh:
	        yaml.safe_dump(
	            payload,
	            fh,
	            allow_unicode=True,  # keep Danish characters readable in the file
	            sort_keys=False,  # preserve the field order built above
	            default_flow_style=False,
	            width=100,
	        )

	    logger.info("=== QA Draft Generation Complete ===")
	    print(f"\nDraft written to: {out_path}")
	    print(f"Total questions: {len(all_questions)}")
	    print(
	        f"Verified quotes: {verified_total} / {len(all_questions)}"
	    )
	    print("\nNext steps:")
	    print(" 1. Open the YAML and review each entry.")
	    print(" 2. Set `reviewed: true` on entries you want to keep.")
	    print(" 3. Edit any field that needs fixing.")
	    print(" 4. Save the curated set as eval/qa_set.yaml when done.")


	# Run the generator only when this file is executed as a script or module
	# (e.g. ``python -m scripts.generate_qa_set``), not when it is imported.
	if __name__ == "__main__":
	main()