Spaces:

MakPr016
/

qp-parser

Sleeping

File size: 8,413 Bytes

d81169f

import re
from typing import Any

# Lower-cased line contents that is_skip() treats as carrying no question
# data. The grouping below is a reading aid only (presumably table headers
# and legend text from the paper template — confirm against real input).
SKIP_LINES = {
    # column-header style entries
    "q. no.",
    "no.",
    "questions",
    "marks",
    "cl",
    "co",
    "co no.",
    "co description",
    # cognitive-level legend entries
    "cognitive",
    "level",
    "remember",
    "understand",
    "apply",
    "analyze",
    "analyse",
    "evaluate",
    "create",
    # other boilerplate
    "course outcomes",
    "usn",
}

# Maps every accepted lower-case spelling of a Bloom's level ("cl3", "btl3",
# "l3") onto the canonical "CL<n>" form, for levels 1 through 6.
BLOOMS_MAP = {
    f"{prefix}{level}": f"CL{level}"
    for prefix in ("cl", "btl", "l")
    for level in range(1, 7)
}

# Line recognisers. All but IS_PAGE_BREAK are anchored at both ends, so they
# only accept a line that consists of the construct and nothing else.

# "PART" followed by a single letter.
IS_PART_HEADER = re.compile(r'^\s*PART\s+[A-Z]\s*$', re.I)
# The word "OR" on its own.
IS_OR = re.compile(r'^\s*OR\s*$', re.I)
# Page-break marker; not end-anchored.
IS_PAGE_BREAK = re.compile(r'---\s*PAGE BREAK\s*---', re.I)
# A bare one- or two-digit number (group 1 holds the digits).
IS_QUESTION_NUM = re.compile(r'^\s*(\d{1,2})\s*$')
# A single letter a-e, either case (group 1).
IS_SUB_LABEL = re.compile(r'^\s*([a-e])\s*$', re.I)
# Bloom's level written as CLn, BTLn or Ln with n in 1..6 (group 1).
IS_BLOOMS = re.compile(r'^\s*(CL[1-6]|BTL[1-6]|L[1-6])\s*$', re.I)
# "CO" plus a one- or two-digit number, optional whitespace between (group 1).
IS_CO = re.compile(r'^\s*(CO\s*\d{1,2})\s*$', re.I)


def normalise_co(raw: str) -> str:
    """Return *raw* in canonical ``CO<digits>`` form.

    The first run of digits found in *raw* is appended to ``"CO"`` unchanged
    (leading zeros are kept). When *raw* contains no digit, the fallback is
    *raw* stripped, upper-cased and with spaces removed.
    """
    digits = re.search(r'\d+', raw)
    if digits is None:
        return raw.strip().upper().replace(" ", "")
    return "CO" + digits.group()


def clean(line: str) -> str:
    """Return *line* without leading or trailing whitespace."""
    trimmed = line.strip()
    return trimmed


def is_skip(line: str) -> bool:
    """Tell whether *line* is blank or listed in ``SKIP_LINES``.

    The comparison is done on the cleaned, lower-cased line.
    """
    text = clean(line)
    if not text:
        return True
    return text.lower() in SKIP_LINES


# ── Token classifier ─────────────────────────────────────────────────────────
# We first convert the raw lines into a flat token list, then run the state
# machine over tokens. This avoids ambiguity between a standalone "4" being
# a marks value vs a question number.

# Token kinds (first element of each token tuple).
TOKEN_PART = "PART"
TOKEN_OR = "OR"
# Question number (context-dependent).
TOKEN_QNUM = "QNUM"
# Sub-question label: a / b / c / d / e.
TOKEN_SUB = "SUB"
TOKEN_BLOOMS = "BLOOMS"
TOKEN_CO = "CO"
# Marks value, 1-25.
TOKEN_MARKS = "MARKS"
TOKEN_TEXT = "TEXT"


def tokenise(lines: list[str]) -> list[tuple[str, str]]:
    """Classify each meaningful line as a ``(kind, value)`` token.

    Blank lines, page-break markers and lines accepted by ``is_skip`` yield
    no token. Every other line is tested against the recognisers in a fixed
    order (part header, OR, Bloom's level, CO, sub-label, bare number); the
    first match decides the kind, and an unmatched line becomes
    ``TOKEN_TEXT``.

    A bare one- or two-digit number is emitted with kind ``"AMBIG"``: it may
    be a marks value or a question number, and only the consumer of the token
    list has the context to decide.

    Args:
        lines: Raw lines of the paper.

    Returns:
        Tokens in input order. Bloom's levels are mapped through
        ``BLOOMS_MAP``, CO values through ``normalise_co``, and sub-labels
        are lower-cased; other values are the cleaned line itself.
    """
    result: list[tuple[str, str]] = []
    for raw_line in lines:
        text = clean(raw_line)
        if not text or IS_PAGE_BREAK.match(text) or is_skip(text):
            continue

        if IS_PART_HEADER.match(text):
            token = (TOKEN_PART, text)
        elif IS_OR.match(text):
            token = (TOKEN_OR, text)
        elif IS_BLOOMS.match(text):
            level = text.upper().replace(" ", "")
            token = (TOKEN_BLOOMS, BLOOMS_MAP.get(level.lower(), level))
        elif IS_CO.match(text):
            token = (TOKEN_CO, normalise_co(text))
        elif IS_SUB_LABEL.match(text):
            token = (TOKEN_SUB, text.lower())
        elif IS_QUESTION_NUM.match(text):
            # Marks or question number; resolved later from parser state.
            token = ("AMBIG", text)
        else:
            token = (TOKEN_TEXT, text)
        result.append(token)
    return result


def parse_question_paper(text: str) -> dict[str, Any]:
    """Extract parts and sub-question metadata from question-paper text.

    The text is split into lines, tokenised, and run through a small state
    machine. Inside a sub-question the expected order is: label, body text,
    marks, Bloom's level, CO. A bare number is read as marks when a
    sub-question is open and has no marks yet (and the value is 1-25), as a
    question number when no sub-question is open, and ignored otherwise.

    A sub-question is recorded as soon as its CO arrives. If it is cut short
    instead (a new label, ``OR``, a ``PART`` header or end of input comes
    first) it is still recorded when marks were seen, with the missing
    Bloom's level / CO left empty and reported in ``warnings``. A label that
    never received marks is discarded.

    Args:
        text: Raw text of the question paper.

    Returns:
        A dict with:
          * ``"question_parts"``: list of parts, each with ``partNumber``,
            ``question1Number``, ``question2Number`` and ``subQuestions``
            (dicts with ``label``, ``marks``, ``bloomsLevel``, ``co_number``).
          * ``"total_marks"``: sum of marks over all recorded sub-questions.
          * ``"warnings"``: human-readable notes about missing data.
    """
    tokens = tokenise(text.splitlines())
    warnings: list[str] = []

    parts: list[dict] = []
    part_number = 0
    current_q1: int | None = None
    current_q2: int | None = None
    after_or = False

    # State of the sub-question currently being assembled.
    sub_label = ""
    marks = 0
    blooms = ""
    co_number = ""
    in_sub = False
    saw_marks = False
    saw_blooms = False

    def get_or_create_part() -> dict:
        """Return the part for the current ``part_number``, creating it if new."""
        for p in parts:
            if p["partNumber"] == part_number:
                return p
        p = {
            "partNumber":      part_number,
            "question1Number": current_q1 or 1,
            "question2Number": current_q2 or (current_q1 + 1 if current_q1 else 2),
            "subQuestions":    [],
        }
        parts.append(p)
        return p

    def reset_sub() -> None:
        """Forget the sub-question in progress."""
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def commit_sub() -> None:
        """Append the sub-question in progress to its part, then reset."""
        p = get_or_create_part()
        if after_or and current_q2:
            p["question2Number"] = current_q2
        p["subQuestions"].append({
            "label":       sub_label,
            "marks":       marks,
            "bloomsLevel": blooms,
            "co_number":   co_number,
        })
        reset_sub()

    def close_sub() -> None:
        """End the open sub-question: keep it if marks were seen, else drop it.

        Used at every boundary so that a sub-question missing its Bloom's
        level or CO is treated the same mid-paper as at end of input.
        """
        if in_sub and saw_marks:
            commit_sub()
        else:
            reset_sub()

    for kind, value in tokens:

        if kind == TOKEN_PART:
            close_sub()
            after_or = False
            continue

        if kind == TOKEN_OR:
            # Close first: the pending sub-question belongs to the question
            # before the OR, so after_or must not be set yet.
            close_sub()
            # NOTE(review): after_or stays set until the next PART header, so
            # any further question number before that header also lands in
            # question2Number of the same part — confirm this is intended.
            after_or = True
            continue

        if kind == "AMBIG":
            val = int(value)
            if in_sub:
                if not saw_marks and 1 <= val <= 25:
                    marks = val
                    saw_marks = True
                # Otherwise noise (e.g. numbers inside the question body).
                continue
            if after_or:
                current_q2 = val
                for p in parts:
                    if p["partNumber"] == part_number:
                        p["question2Number"] = val
            else:
                current_q1 = val
                current_q2 = val + 1
                part_number += 1
            continue

        if kind == TOKEN_SUB:
            close_sub()
            sub_label = value
            in_sub = True
            continue

        if not in_sub:
            # Bloom's / CO / text outside a sub-question carries no data.
            continue

        if kind == TOKEN_BLOOMS and saw_marks:
            blooms = value
            saw_blooms = True
        elif kind == TOKEN_CO and saw_blooms:
            co_number = value
            commit_sub()
        # TOKEN_TEXT (question body) and out-of-order tokens are ignored.

    # Flush whatever is still open at end of input.
    close_sub()

    if not parts:
        warnings.append("No sub-questions detected. The paper layout may be unusual.")
    else:
        missing_co = sum(1 for p in parts for sq in p["subQuestions"] if not sq["co_number"])
        missing_bl = sum(1 for p in parts for sq in p["subQuestions"] if not sq["bloomsLevel"])
        if missing_co:
            warnings.append(f"{missing_co} sub-question(s) have no CO detected — please fill manually.")
        if missing_bl:
            warnings.append(f"{missing_bl} sub-question(s) have no Bloom's level detected — please fill manually.")

    total_marks = sum(sq["marks"] for p in parts for sq in p["subQuestions"])

    return {
        "question_parts": parts,
        "total_marks":    total_marks,
        "warnings":       warnings,
    }