Spaces:
Running
Running
File size: 15,838 Bytes
"""Auto-generate an English QA evaluation test set from Danish PDF documents.
Calls an LLM (default: Qwen3-32B via Groq) to produce question / reference /
source-quote triples grounded in the source PDFs. The generated draft is
written to ``eval/qa_set_draft.yaml`` for human review and curation into the
final ``eval/qa_set.yaml``.
Each generated entry contains:
question: English question
reference_en: English reference answer (1–3 sentences)
source_quote_da: Verbatim Danish substring of the PDF page text
source_doc: PDF filename
source_page_start: First page of the section
source_page_end: Last page of the section
category: "fact" | "procedural" | "definition"
quote_verified: True if the Danish quote was found verbatim in the PDF
reviewed: Set to True manually after human review
Usage:
python -m scripts.generate_qa_set [--max-sections-per-doc 3]
[--questions-per-section 2]
Env vars (.env):
LLM_PROVIDER=groq
GROQ_API_KEY=gsk_...
GROQ_MODEL=qwen/qwen3-32b
"""
import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any
import yaml
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage, SystemMessage
# Repository root: the parent of the directory containing this script. It is
# placed at the front of sys.path so that the ``src.*`` imports that follow
# can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, PROJECT_ROOT)
from src.config import load_settings # noqa: E402
from src.ingestion.pdf_parser import PDFParser # noqa: E402
from src.ingestion.text_cleaner import TextCleaner # noqa: E402
from src.provider import create_llm # noqa: E402
# Module-level logger; its level and format are configured in main().
logger = logging.getLogger(__name__)
# Directory that main() scans for input PDF files.
DOCS_DIR = os.path.join(PROJECT_ROOT, "docs")
# Default destination of the generated draft (can be overridden with --output).
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "eval", "qa_set_draft.yaml")
# Approximate character budget per section sent to the LLM. ~8K chars of
# Danish text is roughly 2.5K tokens, well within Qwen3-32B's 131K window
# even with a verbose system prompt.
_SECTION_CHAR_TARGET = 8000
# System prompt sent with every generation request. It fixes the output
# language (English questions/answers, Danish quotes), demands verbatim
# quotes, lists the allowed categories, and spells out the JSON schema whose
# keys ("questions", "question", "reference_en", "source_quote_da",
# "category") are read back by _generate_for_section.
_SYSTEM_PROMPT = """You are a question generator for a multilingual RAG \
evaluation test set.
You are given a section of a Danish university policy / regulation document. \
Your task is to generate factual question / answer pairs that test whether a \
retrieval system can find the right passage and answer the question correctly.
Strict rules:
1. Questions must be in ENGLISH (the test set targets English-speaking users \
querying Danish documents).
2. Each `reference_en` must be in ENGLISH and faithful to the source — do \
not add information not present in the text.
3. Each `source_quote_da` MUST be an EXACT verbatim substring of the source \
text I gave you. Do not paraphrase, do not translate, do not summarize. \
If you cannot find a clean verbatim quote that supports the answer, do \
not generate that question.
4. Quote length: keep `source_quote_da` between 30 and 400 characters — long \
enough to fully support the answer, short enough to be specific.
5. Categories: each question must be tagged with one of:
- "fact": single specific fact (a number, a deadline, a definition, a rule)
- "procedural": describes a process or sequence of steps
- "definition": defines a term or concept
6. Avoid trivially generic questions like "What does the document say about \
X?" — questions should be answerable in 1-3 sentences with specific content.
7. Output STRICT JSON only, no markdown, no commentary, no thinking. Schema:
{
"questions": [
{
"question": "string (English)",
"reference_en": "string (English, 1-3 sentences)",
"source_quote_da": "string (verbatim Danish substring of input)",
"category": "fact" | "procedural" | "definition"
}
]
}
"""
def _section_pages(
    pages: list[dict[str, str | int]], target_chars: int
) -> list[dict[str, Any]]:
    """Bundle consecutive non-empty pages into sections of roughly ``target_chars``.

    Each page is cleaned and has its headers/footers removed; pages that are
    blank afterwards are ignored. Cleaned pages accumulate until their
    combined length reaches ``target_chars`` and are then emitted as one
    section. Pages still pending after the last page form a final (possibly
    shorter) section.

    Args:
        pages: List of page dicts from PDFParser; the ``"text"`` and
            ``"page_number"`` keys are read.
        target_chars: Approximate character budget per section.

    Returns:
        List of section dicts with 'text', 'page_start', 'page_end'.
    """

    def _as_section(texts: list[str], page_nos: list[int]) -> dict[str, Any]:
        # Pages are separated by a blank line inside the section text.
        return {
            "text": "\n\n".join(texts),
            "page_start": page_nos[0],
            "page_end": page_nos[-1],
        }

    text_cleaner = TextCleaner()
    grouped: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_page_nos: list[int] = []
    pending_len = 0

    for entry in pages:
        page_text = text_cleaner.remove_headers_footers(
            text_cleaner.clean(str(entry["text"]))
        )
        if page_text.strip():
            pending_page_nos.append(int(entry["page_number"]))
            pending_texts.append(page_text)
            pending_len += len(page_text)
            if pending_len >= target_chars:
                # Budget reached: close the section and start a fresh one.
                grouped.append(_as_section(pending_texts, pending_page_nos))
                pending_texts, pending_page_nos, pending_len = [], [], 0

    if pending_texts:
        grouped.append(_as_section(pending_texts, pending_page_nos))
    return grouped
# Soft-hyphen line break: a hyphen at end of line followed by a lowercase
# letter is PDF reflow artefact (e.g. "dæknings-\nområde"), not a real hyphen.
_HYPHEN_LINEBREAK_RE = re.compile(r"-\s*\n\s*(?=[a-zæøå])")
# Quote and bullet glyphs whose presence differs between PDF text extraction
# and LLM-generated quotes. Stripping them entirely makes matching robust to
# straight-vs-curly quotes and to bullets that PyMuPDF drops on extraction.
_STRIP_CHARS = (
"\u2018\u2019\u201a\u201b" # single curly quotes
"\u201c\u201d\u201e\u201f" # double curly quotes
"\u00ab\u00bb" # « »
"'\"" # straight quotes
"\u2022\u2023\u00b7\u25aa\u25e6\u25cf\u25cb" # bullet glyphs
)
_STRIP_TRANSLATE = str.maketrans({c: "" for c in _STRIP_CHARS})
# Word/Wingdings list bullets land in the Unicode Private Use Area (e.g.
# U+F0B7) when extracted from PDFs. Drop the whole BMP PUA range.
_PUA_RE = re.compile(r"[\ue000-\uf8ff]")
def _normalize_for_match(s: str) -> str:
"""Normalize text for tolerant verbatim-quote matching.
Heals PDF-style soft hyphens at line breaks (e.g. ``dæknings-\\nområde``
→ ``dækningsområde``), removes quote characters whose straight/curly
variants differ between PDF extraction and LLM output, drops bullet
glyphs that PDF extraction tends to discard, and collapses whitespace
runs to a single space.
Args:
s: Input string.
Returns:
Normalized string suitable for substring comparison.
"""
s = _HYPHEN_LINEBREAK_RE.sub("", s)
s = _PUA_RE.sub("", s)
s = s.translate(_STRIP_TRANSLATE)
return re.sub(r"\s+", " ", s).strip()
def _verify_quote(quote: str, source_text: str) -> bool:
"""Verify the quote is a verbatim substring of source_text after whitespace normalization.
Args:
quote: Candidate Danish quote produced by the LLM.
source_text: Full source text the quote should originate from.
Returns:
True if the quote is found verbatim (modulo whitespace) in the source.
"""
return _normalize_for_match(quote) in _normalize_for_match(source_text)
def _parse_llm_json(raw: str) -> dict[str, Any]:
"""Extract a JSON object from an LLM response, tolerating code fences and prose.
Args:
raw: Raw LLM output string.
Returns:
Parsed JSON dict.
Raises:
ValueError: If no valid JSON object could be extracted.
"""
text = raw.strip()
fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
if fence:
text = fence.group(1)
else:
start = text.find("{")
end = text.rfind("}")
if start >= 0 and end > start:
text = text[start : end + 1]
try:
return json.loads(text)
except json.JSONDecodeError as exc:
raise ValueError(
f"LLM returned invalid JSON: {exc}\nRaw (first 500 chars): {raw[:500]}"
) from exc
def _generate_for_section(
    llm: BaseChatModel,
    source_doc: str,
    section: dict[str, Any],
    questions_per_section: int,
) -> list[dict[str, Any]]:
    """Call the LLM to generate QA pairs for one document section.

    Failures are logged and swallowed so that one bad section does not abort
    the run: an LLM error, unparseable output, or a malformed ``questions``
    field all yield an empty list. Individual items that are not dicts, or
    whose question / reference / quote are missing, non-string or blank, are
    skipped. Quotes that fail verification are still returned, flagged with
    ``quote_verified: False``.

    Args:
        llm: LangChain BaseChatModel instance.
        source_doc: Filename of the source document.
        section: Section dict with text, page_start, page_end.
        questions_per_section: Target number of questions to generate.

    Returns:
        List of QA dicts (may be empty if the LLM fails or returns nothing
        usable).
    """
    user_prompt = (
        f"Generate exactly {questions_per_section} question/answer pairs from "
        f"the following Danish text. Remember: questions and reference answers "
        f"in ENGLISH, source_quote_da must be VERBATIM from the text below.\n\n"
        f"--- SOURCE TEXT ---\n{section['text']}\n--- END SOURCE TEXT ---"
    )
    messages = [
        SystemMessage(content=_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]
    try:
        response = llm.invoke(messages)
        raw = response.content if hasattr(response, "content") else str(response)
    except Exception as exc:
        # Deliberately broad: provider errors must not abort the whole run.
        logger.error(
            "LLM call failed for %s pages %d-%d: %s",
            source_doc,
            section["page_start"],
            section["page_end"],
            exc,
        )
        return []
    if not isinstance(raw, str):
        # Message content is not guaranteed to be a plain string.
        raw = str(raw)
    try:
        parsed = _parse_llm_json(raw)
    except ValueError as exc:
        logger.warning("Failed to parse LLM JSON for %s: %s", source_doc, exc)
        return []
    if not isinstance(parsed, dict):
        # Valid JSON whose root is a list/number/etc. has no "questions" key.
        logger.warning("LLM JSON root is not an object for %s", source_doc)
        return []
    items = parsed.get("questions", [])
    if not isinstance(items, list):
        logger.warning("LLM 'questions' field is not a list for %s", source_doc)
        return []

    valid_categories = {"fact", "procedural", "definition"}
    result: list[dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        question = item.get("question")
        reference_en = item.get("reference_en")
        source_quote_da = item.get("source_quote_da")
        if not (
            isinstance(question, str)
            and isinstance(reference_en, str)
            and isinstance(source_quote_da, str)
        ):
            continue
        if not (question.strip() and reference_en.strip() and source_quote_da.strip()):
            continue

        # The category is untrusted model output: it may be missing, of the
        # wrong type (an unhashable list would make a set lookup raise), or
        # differ only in case/whitespace. Anything unrecognised becomes "fact".
        category = item.get("category", "fact")
        if isinstance(category, str):
            category = category.strip().lower()
        if not (isinstance(category, str) and category in valid_categories):
            category = "fact"

        verified = _verify_quote(source_quote_da, section["text"])
        result.append(
            {
                "question": question.strip(),
                "reference_en": reference_en.strip(),
                "source_quote_da": source_quote_da.strip(),
                "source_doc": source_doc,
                "source_page_start": section["page_start"],
                "source_page_end": section["page_end"],
                "category": category,
                "quote_verified": verified,
                "reviewed": False,
            }
        )
    return result
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    ``--max-sections-per-doc`` and ``--questions-per-section`` must be
    non-negative integers; a negative section cap would otherwise be applied
    as a negative slice bound and silently drop sections from the end.

    Returns:
        Parsed argument namespace with ``max_sections_per_doc``,
        ``questions_per_section``, ``output`` and ``limit_docs``.
    """

    def _non_negative_int(value: str) -> int:
        # argparse ``type=`` callable: converts to int and rejects negatives.
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid int value: {value!r}"
            ) from None
        if number < 0:
            raise argparse.ArgumentTypeError(f"must be >= 0, got {number}")
        return number

    parser = argparse.ArgumentParser(
        description="Auto-generate an English QA test set from Danish PDFs.",
    )
    parser.add_argument(
        "--max-sections-per-doc",
        type=_non_negative_int,
        default=3,
        help="Max sections to process per PDF (caps total questions; default 3).",
    )
    parser.add_argument(
        "--questions-per-section",
        type=_non_negative_int,
        default=2,
        help="Number of QA pairs to request per section (default 2).",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=OUTPUT_PATH,
        help=f"Output YAML path (default: {OUTPUT_PATH}).",
    )
    parser.add_argument(
        "--limit-docs",
        type=int,
        default=0,
        help="Process at most N PDFs (0 = all). Useful for smoke testing.",
    )
    return parser.parse_args()
def main() -> None:
    """Run the end-to-end draft generation and write the result as YAML.

    Steps: parse CLI arguments and settings, configure logging, build the
    LLM, split every PDF under ``DOCS_DIR`` into sections, request QA pairs
    for each section, then dump all candidates plus summary metadata to the
    output path and print a short summary with review instructions.
    """
    args = parse_args()
    settings = load_settings()
    log_level = getattr(logging, settings.log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    using_groq = settings.llm_provider == "groq"
    if not using_groq:
        logger.warning(
            "LLM_PROVIDER is '%s', not 'groq'. The QA generator works with any "
            "provider but Qwen3-32B via Groq is recommended.",
            settings.llm_provider,
        )
    if using_groq:
        model_label = settings.groq_model
    else:
        model_label = settings.generation_model

    logger.info("=== QA Draft Generation Start ===")
    logger.info("LLM provider: %s | model: %s", settings.llm_provider, model_label)
    llm = create_llm(settings)
    parser = PDFParser()

    pdf_files = sorted(
        name for name in os.listdir(DOCS_DIR) if name.lower().endswith(".pdf")
    )
    if args.limit_docs > 0:
        pdf_files = pdf_files[: args.limit_docs]
    logger.info("Found %d PDFs in %s", len(pdf_files), DOCS_DIR)

    all_questions: list[dict[str, Any]] = []
    for pdf_file in pdf_files:
        logger.info("Processing %s ...", pdf_file)
        try:
            pages = parser.parse(os.path.join(DOCS_DIR, pdf_file))
        except Exception as exc:
            # One unreadable PDF should not stop the remaining documents.
            logger.error("Failed to parse %s: %s", pdf_file, exc)
            continue

        sections = _section_pages(pages, _SECTION_CHAR_TARGET)[
            : args.max_sections_per_doc
        ]
        section_count = len(sections)
        logger.info(" -> %d sections", section_count)

        for index, section in enumerate(sections, start=1):
            logger.info(
                " Section %d/%d (pages %d-%d)",
                index,
                section_count,
                section["page_start"],
                section["page_end"],
            )
            qa_items = _generate_for_section(
                llm=llm,
                source_doc=pdf_file,
                section=section,
                questions_per_section=args.questions_per_section,
            )
            verified_here = len([q for q in qa_items if q["quote_verified"]])
            logger.info(
                " -> %d questions (%d verified)", len(qa_items), verified_here
            )
            all_questions.extend(qa_items)

    total = len(all_questions)
    verified_total = sum(1 for q in all_questions if q["quote_verified"])

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    meta = {
        "generator": "scripts/generate_qa_set.py",
        "llm_provider": settings.llm_provider,
        "llm_model": model_label,
        "total_candidates": total,
        "verified_quotes": verified_total,
    }
    payload = {"meta": meta, "questions": all_questions}
    with open(out_path, "w", encoding="utf-8") as fh:
        yaml.safe_dump(
            payload,
            fh,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100,
        )
    logger.info("=== QA Draft Generation Complete ===")

    print(f"\nDraft written to: {out_path}")
    print(f"Total questions: {total}")
    print(f"Verified quotes: {verified_total} / {total}")
    print("\nNext steps:")
    for step in (
        " 1. Open the YAML and review each entry.",
        " 2. Set `reviewed: true` on entries you want to keep.",
        " 3. Edit any field that needs fixing.",
        " 4. Save the curated set as eval/qa_set.yaml when done.",
    ):
        print(step)
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|