import re
from typing import Any

# Header/boilerplate lines (compared lower-cased) that carry no question
# content and are dropped before tokenising.
SKIP_LINES = {
    "q. no.", "questions", "marks", "cl", "co", "co no.", "co description",
    "cognitive", "level", "remember", "understand", "apply", "analyze",
    "analyse", "evaluate", "create", "course outcomes", "usn", "no.",
}

# Maps every accepted Bloom's-level spelling (lower-cased, space-free) to the
# canonical "CL<n>" form: cl1-cl6, btl1-btl6, l1-l6.
BLOOMS_MAP = {
    f"{prefix}{i}": f"CL{i}"
    for prefix in ("cl", "btl", "l")
    for i in range(1, 7)
}

# Line classifiers, applied to already-stripped lines.
IS_PART_HEADER = re.compile(r'^\s*PART\s+[A-Z]\s*$', re.IGNORECASE)
IS_OR = re.compile(r'^\s*OR\s*$', re.IGNORECASE)
IS_PAGE_BREAK = re.compile(r'---\s*PAGE BREAK\s*---', re.IGNORECASE)
IS_QUESTION_NUM = re.compile(r'^\s*(\d{1,2})\s*$')
IS_SUB_LABEL = re.compile(r'^\s*([a-e])\s*$', re.IGNORECASE)
IS_BLOOMS = re.compile(r'^\s*(CL[1-6]|BTL[1-6]|L[1-6])\s*$', re.IGNORECASE)
IS_CO = re.compile(r'^\s*(CO\s*\d{1,2})\s*$', re.IGNORECASE)


def normalise_co(raw: str) -> str:
    """Normalise a course-outcome string (e.g. "CO 3", "co3") to "CO<n>".

    Falls back to an upper-cased, space-free copy when no digit is present.
    """
    num = re.search(r'\d+', raw)
    return f"CO{num.group()}" if num else raw.strip().upper().replace(" ", "")


def clean(line: str) -> str:
    """Strip leading/trailing whitespace from *line*."""
    return line.strip()


def is_skip(line: str) -> bool:
    """Return True when *line* is blank or a known boilerplate/header line."""
    stripped = clean(line)  # strip once; original called clean() twice
    return not stripped or stripped.lower() in SKIP_LINES


# ── Token classifier ─────────────────────────────────────────────────────────
# We first convert the raw lines into a flat token list, then run the state
# machine over tokens. This avoids ambiguity between a standalone "4" being
# a marks value vs a question number.
# Token kinds emitted by tokenise(). TOKEN_QNUM and TOKEN_MARKS are part of
# the public vocabulary, but tokenise() never emits them directly: a bare
# 1-2 digit number is ambiguous (marks vs. question number), so it is emitted
# as TOKEN_AMBIG and resolved by the state machine in parse_question_paper().
TOKEN_PART = "PART"
TOKEN_OR = "OR"
TOKEN_QNUM = "QNUM"      # question number (context-dependent)
TOKEN_SUB = "SUB"        # a / b / c / d / e
TOKEN_BLOOMS = "BLOOMS"
TOKEN_CO = "CO"
TOKEN_MARKS = "MARKS"    # 1-25
TOKEN_TEXT = "TEXT"
TOKEN_AMBIG = "AMBIG"    # bare small integer: marks OR question number


def tokenise(lines: list[str]) -> list[tuple[str, str]]:
    """Convert raw text lines into a flat list of ``(kind, value)`` tokens.

    Blank lines, "--- PAGE BREAK ---" markers and boilerplate (SKIP_LINES)
    are dropped. Bloom's levels are normalised to "CL<n>" and COs to "CO<n>".
    A standalone 1-2 digit number becomes TOKEN_AMBIG (see note above).
    """
    tokens: list[tuple[str, str]] = []
    for raw in lines:
        line = clean(raw)
        if not line or IS_PAGE_BREAK.match(line) or is_skip(line):
            continue
        if IS_PART_HEADER.match(line):
            tokens.append((TOKEN_PART, line))
        elif IS_OR.match(line):
            tokens.append((TOKEN_OR, line))
        elif IS_BLOOMS.match(line):
            key = line.upper().replace(" ", "")
            # Fall back to the upper-cased key if the spelling is unknown.
            tokens.append((TOKEN_BLOOMS, BLOOMS_MAP.get(key.lower(), key)))
        elif IS_CO.match(line):
            tokens.append((TOKEN_CO, normalise_co(line)))
        elif IS_SUB_LABEL.match(line):
            tokens.append((TOKEN_SUB, line.lower()))
        elif IS_QUESTION_NUM.match(line):
            tokens.append((TOKEN_AMBIG, line))
        else:
            tokens.append((TOKEN_TEXT, line))
    return tokens


def parse_question_paper(text: str) -> dict[str, Any]:
    """Parse extracted question-paper text into structured question parts.

    Returns ``{"question_parts": [...], "total_marks": int, "warnings": [...]}``.
    Each part records one question pair (``question1Number`` OR
    ``question2Number``) plus its sub-questions, each with label, marks,
    Bloom's level and CO number (empty string when not detected).
    """
    tokens = tokenise(text.splitlines())

    warnings: list[str] = []
    parts: list[dict] = []
    part_number = 0
    current_q1: int | None = None
    current_q2: int | None = None
    after_or = False  # True once an "OR" separator has been seen for this pair

    # Per-sub-question accumulator. After a SUB label we expect the token
    # sequence: TEXT* MARKS(AMBIG) BLOOMS CO.
    sub_label = ""
    marks = 0
    blooms = ""
    co_number = ""
    in_sub = False
    saw_marks = False
    saw_blooms = False

    def get_or_create_part() -> dict:
        """Return the part record for part_number, creating it on first use."""
        for p in parts:
            if p["partNumber"] == part_number:
                return p
        p = {
            "partNumber": part_number,
            "question1Number": current_q1 or 1,
            "question2Number": current_q2 or (current_q1 + 1 if current_q1 else 2),
            "subQuestions": [],
        }
        parts.append(p)
        return p

    def reset_sub() -> None:
        """Discard the in-progress sub-question accumulator."""
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def commit_sub() -> None:
        """Append the accumulated sub-question to its part and reset state."""
        p = get_or_create_part()
        if after_or and current_q2:
            p["question2Number"] = current_q2
        p["subQuestions"].append({
            "label": sub_label,
            "marks": marks,
            "bloomsLevel": blooms,
            "co_number": co_number,
        })
        reset_sub()

    for kind, value in tokens:
        if kind == TOKEN_PART:
            # A part header always terminates any half-parsed sub-question.
            reset_sub()
            after_or = False
        elif kind == TOKEN_OR:
            if in_sub:
                reset_sub()
            after_or = True
            # NOTE(review): after_or is only cleared by the next PART header.
            # If a paper lists several question pairs under one PART without
            # repeating the header, a later first-of-pair number would be
            # mistaken for an OR-alternative — confirm against real inputs.
        elif kind == TOKEN_AMBIG:
            val = int(value)
            if in_sub and not saw_marks and 1 <= val <= 25:
                # Inside a sub with no marks yet: this number is the marks.
                marks = val
                saw_marks = True
            elif not in_sub:
                if not after_or:
                    # First question number of a new pair opens a new part.
                    current_q1 = val
                    current_q2 = val + 1
                    part_number += 1
                else:
                    # Question number of the OR-alternative.
                    current_q2 = val
                    for p in parts:
                        if p["partNumber"] == part_number:
                            p["question2Number"] = val
            # Otherwise it is noise (e.g. a number in question text) — skip.
        elif kind == TOKEN_SUB:
            # BUGFIX: commit any sub that already has marks, even when the
            # Bloom's level or CO was not detected — the warnings below count
            # exactly those gaps. (The old condition also required blooms+CO,
            # which a CO token already commits, so partial subs were silently
            # dropped and their marks lost.)
            if in_sub and saw_marks:
                commit_sub()
            elif in_sub:
                reset_sub()
            sub_label = value
            in_sub = True
        elif in_sub:
            if kind == TOKEN_BLOOMS and saw_marks:
                blooms = value
                saw_blooms = True
            elif kind == TOKEN_CO and saw_blooms:
                co_number = value
                commit_sub()
            # TEXT (question body) and out-of-order tokens are ignored.
        # Tokens outside a sub that reach here are ignored.

    # Flush a trailing sub-question that never saw its CO token.
    if in_sub and marks > 0:
        commit_sub()

    # ── Warnings ─────────────────────────────────────────────────────────────
    if not parts:
        warnings.append("No sub-questions detected. The paper layout may be unusual.")
    else:
        missing_co = sum(
            1 for p in parts for sq in p["subQuestions"] if not sq["co_number"]
        )
        missing_bl = sum(
            1 for p in parts for sq in p["subQuestions"] if not sq["bloomsLevel"]
        )
        if missing_co:
            warnings.append(f"{missing_co} sub-question(s) have no CO detected — please fill manually.")
        if missing_bl:
            warnings.append(f"{missing_bl} sub-question(s) have no Bloom's level detected — please fill manually.")

    total_marks = sum(sq["marks"] for p in parts for sq in p["subQuestions"])
    return {
        "question_parts": parts,
        "total_marks": total_marks,
        "warnings": warnings,
    }