Spaces:
Running
Running
| """Auto-generate an English QA evaluation test set from Danish PDF documents. | |
| Calls an LLM (default: Qwen3-32B via Groq) to produce question / reference / | |
| source-quote triples grounded in the source PDFs. The generated draft is | |
| written to ``eval/qa_set_draft.yaml`` for human review and curation into the | |
| final ``eval/qa_set.yaml``. | |
| Each generated entry contains: | |
| question: English question | |
| reference_en: English reference answer (1–3 sentences) | |
| source_quote_da: Verbatim Danish substring of the PDF page text | |
| source_doc: PDF filename | |
| source_page_start: First page of the section | |
| source_page_end: Last page of the section | |
| category: "fact" | "procedural" | "definition" | |
| quote_verified: True if the Danish quote was found verbatim in the PDF | |
| reviewed: Set to True manually after human review | |
| Usage: | |
| python -m scripts.generate_qa_set [--max-sections-per-doc 3] | |
| [--questions-per-section 2] | |
| Env vars (.env): | |
| LLM_PROVIDER=groq | |
| GROQ_API_KEY=gsk_... | |
| GROQ_MODEL=qwen/qwen3-32b | |
| """ | |
| import argparse | |
| import json | |
| import logging | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| import yaml | |
| from langchain_core.language_models.chat_models import BaseChatModel | |
| from langchain_core.messages import HumanMessage, SystemMessage | |
| PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) | |
| sys.path.insert(0, PROJECT_ROOT) | |
| from src.config import load_settings # noqa: E402 | |
| from src.ingestion.pdf_parser import PDFParser # noqa: E402 | |
| from src.ingestion.text_cleaner import TextCleaner # noqa: E402 | |
| from src.provider import create_llm # noqa: E402 | |
# Module-level logger; level and format are configured by logging.basicConfig
# in main().
logger = logging.getLogger(__name__)

# Directory scanned for source PDFs, and the default path of the generated
# draft (overridable with --output).
DOCS_DIR = os.path.join(PROJECT_ROOT, "docs")
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "eval", "qa_set_draft.yaml")

# Approximate character budget per section sent to the LLM. ~8K chars of
# Danish text is roughly 2.5K tokens, well within Qwen3-32B's 131K window
# even with a verbose system prompt.
_SECTION_CHAR_TARGET = 8000
# System prompt sent with every section. It defines the output contract the
# rest of this module parses: a JSON object with a "questions" list whose
# items carry question / reference_en / source_quote_da / category.
# A trailing backslash joins the next line into the same prompt line.
_SYSTEM_PROMPT = """You are a question generator for a multilingual RAG \
evaluation test set.
You are given a section of a Danish university policy / regulation document. \
Your task is to generate factual question / answer pairs that test whether a \
retrieval system can find the right passage and answer the question correctly.
Strict rules:
1. Questions must be in ENGLISH (the test set targets English-speaking users \
querying Danish documents).
2. Each `reference_en` must be in ENGLISH and faithful to the source — do \
not add information not present in the text.
3. Each `source_quote_da` MUST be an EXACT verbatim substring of the source \
text I gave you. Do not paraphrase, do not translate, do not summarize. \
If you cannot find a clean verbatim quote that supports the answer, do \
not generate that question.
4. Quote length: keep `source_quote_da` between 30 and 400 characters — long \
enough to fully support the answer, short enough to be specific.
5. Categories: each question must be tagged with one of:
- "fact": single specific fact (a number, a deadline, a definition, a rule)
- "procedural": describes a process or sequence of steps
- "definition": defines a term or concept
6. Avoid trivially generic questions like "What does the document say about \
X?" — questions should be answerable in 1-3 sentences with specific content.
7. Output STRICT JSON only, no markdown, no commentary, no thinking. Schema:
{
"questions": [
{
"question": "string (English)",
"reference_en": "string (English, 1-3 sentences)",
"source_quote_da": "string (verbatim Danish substring of input)",
"category": "fact" | "procedural" | "definition"
}
]
}
"""
def _section_pages(
    pages: list[dict[str, str | int]], target_chars: int
) -> list[dict[str, Any]]:
    """Bundle consecutive non-blank pages into sections of about target_chars.

    Each page is run through ``TextCleaner.clean`` and then
    ``TextCleaner.remove_headers_footers``; pages that are blank afterwards
    are skipped. Cleaned pages accumulate until their combined length reaches
    ``target_chars``, at which point they are emitted as one section (so a
    section may overshoot the budget by up to one page). Whatever is left at
    the end becomes a final, shorter section.

    Args:
        pages: Page dicts from PDFParser; the 'text' and 'page_number' keys
            are read.
        target_chars: Approximate character budget per section.

    Returns:
        Section dicts with 'text' (page texts joined by a blank line),
        'page_start' and 'page_end'.
    """

    def build(texts: list[str], numbers: list[int]) -> dict[str, Any]:
        # Materialise the pending pages as one section record.
        return {
            "text": "\n\n".join(texts),
            "page_start": numbers[0],
            "page_end": numbers[-1],
        }

    text_cleaner = TextCleaner()
    grouped: list[dict[str, Any]] = []
    pending_texts: list[str] = []
    pending_numbers: list[int] = []
    pending_len = 0

    for page in pages:
        body = text_cleaner.remove_headers_footers(
            text_cleaner.clean(str(page["text"]))
        )
        if not body.strip():
            continue

        number = int(page["page_number"])
        pending_texts.append(body)
        pending_numbers.append(number)
        pending_len += len(body)

        if pending_len < target_chars:
            continue

        # Budget reached: close the current section and start a fresh one.
        grouped.append(build(pending_texts, pending_numbers))
        pending_texts, pending_numbers, pending_len = [], [], 0

    if pending_texts:
        grouped.append(build(pending_texts, pending_numbers))

    return grouped
| # Soft-hyphen line break: a hyphen at end of line followed by a lowercase | |
| # letter is PDF reflow artefact (e.g. "dæknings-\nområde"), not a real hyphen. | |
| _HYPHEN_LINEBREAK_RE = re.compile(r"-\s*\n\s*(?=[a-zæøå])") | |
| # Quote and bullet glyphs whose presence differs between PDF text extraction | |
| # and LLM-generated quotes. Stripping them entirely makes matching robust to | |
| # straight-vs-curly quotes and to bullets that PyMuPDF drops on extraction. | |
| _STRIP_CHARS = ( | |
| "\u2018\u2019\u201a\u201b" # single curly quotes | |
| "\u201c\u201d\u201e\u201f" # double curly quotes | |
| "\u00ab\u00bb" # « » | |
| "'\"" # straight quotes | |
| "\u2022\u2023\u00b7\u25aa\u25e6\u25cf\u25cb" # bullet glyphs | |
| ) | |
| _STRIP_TRANSLATE = str.maketrans({c: "" for c in _STRIP_CHARS}) | |
| # Word/Wingdings list bullets land in the Unicode Private Use Area (e.g. | |
| # U+F0B7) when extracted from PDFs. Drop the whole BMP PUA range. | |
| _PUA_RE = re.compile(r"[\ue000-\uf8ff]") | |
| def _normalize_for_match(s: str) -> str: | |
| """Normalize text for tolerant verbatim-quote matching. | |
| Heals PDF-style soft hyphens at line breaks (e.g. ``dæknings-\\nområde`` | |
| → ``dækningsområde``), removes quote characters whose straight/curly | |
| variants differ between PDF extraction and LLM output, drops bullet | |
| glyphs that PDF extraction tends to discard, and collapses whitespace | |
| runs to a single space. | |
| Args: | |
| s: Input string. | |
| Returns: | |
| Normalized string suitable for substring comparison. | |
| """ | |
| s = _HYPHEN_LINEBREAK_RE.sub("", s) | |
| s = _PUA_RE.sub("", s) | |
| s = s.translate(_STRIP_TRANSLATE) | |
| return re.sub(r"\s+", " ", s).strip() | |
| def _verify_quote(quote: str, source_text: str) -> bool: | |
| """Verify the quote is a verbatim substring of source_text after whitespace normalization. | |
| Args: | |
| quote: Candidate Danish quote produced by the LLM. | |
| source_text: Full source text the quote should originate from. | |
| Returns: | |
| True if the quote is found verbatim (modulo whitespace) in the source. | |
| """ | |
| return _normalize_for_match(quote) in _normalize_for_match(source_text) | |
| def _parse_llm_json(raw: str) -> dict[str, Any]: | |
| """Extract a JSON object from an LLM response, tolerating code fences and prose. | |
| Args: | |
| raw: Raw LLM output string. | |
| Returns: | |
| Parsed JSON dict. | |
| Raises: | |
| ValueError: If no valid JSON object could be extracted. | |
| """ | |
| text = raw.strip() | |
| fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL) | |
| if fence: | |
| text = fence.group(1) | |
| else: | |
| start = text.find("{") | |
| end = text.rfind("}") | |
| if start >= 0 and end > start: | |
| text = text[start : end + 1] | |
| try: | |
| return json.loads(text) | |
| except json.JSONDecodeError as exc: | |
| raise ValueError( | |
| f"LLM returned invalid JSON: {exc}\nRaw (first 500 chars): {raw[:500]}" | |
| ) from exc | |
def _generate_for_section(
    llm: BaseChatModel,
    source_doc: str,
    section: dict[str, Any],
    questions_per_section: int,
) -> list[dict[str, Any]]:
    """Call the LLM to generate QA pairs for one document section.

    Items whose quote cannot be verified against the section text are still
    returned, flagged with ``quote_verified: False`` so a human reviewer can
    decide. Items with missing, non-string or blank fields are dropped.

    Args:
        llm: LangChain BaseChatModel instance.
        source_doc: Filename of the source document.
        section: Section dict with text, page_start, page_end.
        questions_per_section: Target number of questions to generate.

    Returns:
        List of QA dicts. Empty if the LLM call fails, the response cannot be
        parsed into the expected shape, or no item has usable fields.
    """
    user_prompt = (
        f"Generate exactly {questions_per_section} question/answer pairs from "
        "the following Danish text. Remember: questions and reference answers "
        "in ENGLISH, source_quote_da must be VERBATIM from the text below.\n\n"
        f"--- SOURCE TEXT ---\n{section['text']}\n--- END SOURCE TEXT ---"
    )
    messages = [
        SystemMessage(content=_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]

    # Deliberately broad: a failure for one section (network, rate limit,
    # provider error) must not abort the whole generation run.
    try:
        response = llm.invoke(messages)
        raw = response.content if hasattr(response, "content") else str(response)
    except Exception as exc:
        logger.error(
            "LLM call failed for %s pages %d-%d: %s",
            source_doc,
            section["page_start"],
            section["page_end"],
            exc,
        )
        return []

    if not isinstance(raw, str):
        raw = str(raw)

    try:
        parsed = _parse_llm_json(raw)
    except ValueError as exc:
        logger.warning("Failed to parse LLM JSON for %s: %s", source_doc, exc)
        return []

    # Guard the root type here as well, so a list/scalar root cannot raise
    # AttributeError on .get() and take down the run.
    if not isinstance(parsed, dict):
        logger.warning("LLM JSON root is not an object for %s", source_doc)
        return []

    items = parsed.get("questions", [])
    if not isinstance(items, list):
        logger.warning("LLM 'questions' field is not a list for %s", source_doc)
        return []

    # Tuple membership compares with ==, so an unhashable category value
    # (e.g. a list) cannot raise TypeError the way set membership would.
    valid_categories = ("fact", "procedural", "definition")

    result: list[dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            continue

        question = item.get("question")
        reference_en = item.get("reference_en")
        source_quote_da = item.get("source_quote_da")
        category = item.get("category", "fact")

        if not (
            isinstance(question, str)
            and isinstance(reference_en, str)
            and isinstance(source_quote_da, str)
        ):
            continue
        if not (question.strip() and reference_en.strip() and source_quote_da.strip()):
            continue

        # Unknown or malformed categories fall back to "fact".
        if not (isinstance(category, str) and category in valid_categories):
            category = "fact"

        verified = _verify_quote(source_quote_da, section["text"])
        result.append(
            {
                "question": question.strip(),
                "reference_en": reference_en.strip(),
                "source_quote_da": source_quote_da.strip(),
                "source_doc": source_doc,
                "source_page_start": section["page_start"],
                "source_page_end": section["page_end"],
                "category": category,
                "quote_verified": verified,
                "reviewed": False,
            }
        )
    return result
def _int_at_least(minimum: int):
    """Build an argparse ``type=`` converter accepting integers >= minimum."""

    def convert(value: str) -> int:
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
        if number < minimum:
            raise argparse.ArgumentTypeError(f"must be >= {minimum}, got {number}")
        return number

    return convert


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    Out-of-range counts are rejected up front with a usage error instead of
    silently misbehaving later: a negative ``--max-sections-per-doc`` would
    otherwise drop trailing sections through slicing, and a negative
    ``--limit-docs`` would silently mean "all".

    Returns:
        Parsed argument namespace with ``max_sections_per_doc``,
        ``questions_per_section``, ``output`` and ``limit_docs``.
    """
    parser = argparse.ArgumentParser(
        description="Auto-generate an English QA test set from Danish PDFs.",
    )
    parser.add_argument(
        "--max-sections-per-doc",
        type=_int_at_least(1),
        default=3,
        help="Max sections to process per PDF (caps total questions; default 3).",
    )
    parser.add_argument(
        "--questions-per-section",
        type=_int_at_least(1),
        default=2,
        help="Number of QA pairs to request per section (default 2).",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=OUTPUT_PATH,
        help=f"Output YAML path (default: {OUTPUT_PATH}).",
    )
    parser.add_argument(
        "--limit-docs",
        type=_int_at_least(0),
        default=0,
        help="Process at most N PDFs (0 = all). Useful for smoke testing.",
    )
    return parser.parse_args()
def main() -> None:
    """Run the end-to-end draft generation and write the result as YAML.

    Steps: read CLI arguments and settings, configure logging, build the LLM,
    walk the PDFs in ``DOCS_DIR`` (alphabetically, optionally capped by
    ``--limit-docs``), split each into sections, ask the LLM for QA pairs per
    section, then dump all candidates plus a small ``meta`` header to the
    output path and print a summary with review instructions. PDFs that fail
    to parse are logged and skipped.
    """
    args = parse_args()
    settings = load_settings()

    log_level = getattr(logging, settings.log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    using_groq = settings.llm_provider == "groq"
    if not using_groq:
        logger.warning(
            "LLM_PROVIDER is '%s', not 'groq'. The QA generator works with any "
            "provider but Qwen3-32B via Groq is recommended.",
            settings.llm_provider,
        )
    # Model name reported in logs and in the YAML meta header.
    model_label = settings.groq_model if using_groq else settings.generation_model

    logger.info("=== QA Draft Generation Start ===")
    logger.info("LLM provider: %s | model: %s", settings.llm_provider, model_label)

    llm = create_llm(settings)
    pdf_parser = PDFParser()

    pdf_names = sorted(
        name for name in os.listdir(DOCS_DIR) if name.lower().endswith(".pdf")
    )
    if args.limit_docs > 0:
        pdf_names = pdf_names[: args.limit_docs]
    logger.info("Found %d PDFs in %s", len(pdf_names), DOCS_DIR)

    candidates: list[dict[str, Any]] = []
    for pdf_name in pdf_names:
        logger.info("Processing %s ...", pdf_name)
        try:
            pages = pdf_parser.parse(os.path.join(DOCS_DIR, pdf_name))
        except Exception as exc:
            # One unreadable PDF should not stop the remaining documents.
            logger.error("Failed to parse %s: %s", pdf_name, exc)
            continue

        sections = _section_pages(pages, _SECTION_CHAR_TARGET)[
            : args.max_sections_per_doc
        ]
        section_count = len(sections)
        logger.info(" -> %d sections", section_count)

        for index, section in enumerate(sections, start=1):
            logger.info(
                " Section %d/%d (pages %d-%d)",
                index,
                section_count,
                section["page_start"],
                section["page_end"],
            )
            generated = _generate_for_section(
                llm=llm,
                source_doc=pdf_name,
                section=section,
                questions_per_section=args.questions_per_section,
            )
            verified_here = len([q for q in generated if q["quote_verified"]])
            logger.info(" -> %d questions (%d verified)", len(generated), verified_here)
            candidates.extend(generated)

    total = len(candidates)
    verified_total = len([q for q in candidates if q["quote_verified"]])

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Key order is preserved in the file because sort_keys is disabled below.
    payload = {
        "meta": {
            "generator": "scripts/generate_qa_set.py",
            "llm_provider": settings.llm_provider,
            "llm_model": model_label,
            "total_candidates": total,
            "verified_quotes": verified_total,
        },
        "questions": candidates,
    }
    with out_path.open("w", encoding="utf-8") as fh:
        yaml.safe_dump(
            payload,
            fh,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100,
        )

    logger.info("=== QA Draft Generation Complete ===")

    print(f"\nDraft written to: {out_path}")
    print(f"Total questions: {total}")
    print(f"Verified quotes: {verified_total} / {total}")
    for line in (
        "\nNext steps:",
        " 1. Open the YAML and review each entry.",
        " 2. Set `reviewed: true` on entries you want to keep.",
        " 3. Edit any field that needs fixing.",
        " 4. Save the curated set as eval/qa_set.yaml when done.",
    ):
        print(line)
# Run the generator only when this file is executed as a script (e.g. via
# `python -m scripts.generate_qa_set`), not when it is imported.
if __name__ == "__main__":
    main()