import re
from typing import Any
# Boilerplate table-header words (compared lower-cased) that carry no
# question content and are dropped before tokenisation.
SKIP_LINES = {
    "q. no.", "questions", "marks", "cl", "co", "co no.", "co description",
    "cognitive", "level", "remember", "understand", "apply", "analyze",
    "analyse", "evaluate", "create", "course outcomes", "usn", "no.",
}
# Maps the Bloom's-level spellings seen in papers (CL / BTL / L prefixes,
# keys lower-cased) onto the canonical "CLn" form.
BLOOMS_MAP = {
    "cl1": "CL1", "cl2": "CL2", "cl3": "CL3",
    "cl4": "CL4", "cl5": "CL5", "cl6": "CL6",
    "btl1": "CL1", "btl2": "CL2", "btl3": "CL3",
    "btl4": "CL4", "btl5": "CL5", "btl6": "CL6",
    "l1": "CL1", "l2": "CL2", "l3": "CL3",
    "l4": "CL4", "l5": "CL5", "l6": "CL6",
}
# Whole-line shape classifiers used by tokenise(); all but IS_PAGE_BREAK are
# anchored to the full line, and all are case-insensitive except the bare int.
IS_PART_HEADER = re.compile(r'^\s*PART\s+[A-Z]\s*$', re.IGNORECASE)  # e.g. "PART A"
IS_OR = re.compile(r'^\s*OR\s*$', re.IGNORECASE)  # internal-choice separator
IS_PAGE_BREAK = re.compile(r'---\s*PAGE BREAK\s*---', re.IGNORECASE)  # extractor marker
IS_QUESTION_NUM = re.compile(r'^\s*(\d{1,2})\s*$')  # bare 1-2 digit integer
IS_SUB_LABEL = re.compile(r'^\s*([a-e])\s*$', re.IGNORECASE)  # sub-question letter
IS_BLOOMS = re.compile(r'^\s*(CL[1-6]|BTL[1-6]|L[1-6])\s*$', re.IGNORECASE)
IS_CO = re.compile(r'^\s*(CO\s*\d{1,2})\s*$', re.IGNORECASE)  # course outcome
def normalise_co(raw: str) -> str:
    """Canonicalise a course-outcome label to the form "CO<digits>".

    "CO 3" / "co3" -> "CO3". When no digit is present, the input is
    merely stripped, upper-cased, and has internal spaces removed.
    """
    digits = re.search(r'\d+', raw)
    if digits is not None:
        return f"CO{digits.group()}"
    return raw.strip().upper().replace(" ", "")
def clean(line: str) -> str:
    """Return *line* with leading and trailing whitespace removed."""
    return line.strip()
def is_skip(line: str) -> bool:
    """Return True when *line* should be dropped before tokenisation.

    A line is skippable when, after whitespace stripping, it is empty or
    matches (case-insensitively) one of the boilerplate header words in
    SKIP_LINES.
    """
    # Hoist: the original stripped the line twice (once per test).
    text = clean(line)
    return not text or text.lower() in SKIP_LINES
# -- Token classifier --------------------------------------------------------
# We first convert the raw lines into a flat token list, then run the state
# machine over tokens. This avoids ambiguity between a standalone "4" being
# a marks value vs a question number.
TOKEN_PART = "PART"      # "PART A"-style section header
TOKEN_OR = "OR"          # internal-choice separator between question pairs
TOKEN_QNUM = "QNUM"      # question number (context-dependent)
TOKEN_SUB = "SUB"        # a / b / c / d / e
TOKEN_BLOOMS = "BLOOMS"  # normalised Bloom's level, e.g. "CL3"
TOKEN_CO = "CO"          # normalised course outcome, e.g. "CO2"
TOKEN_MARKS = "MARKS"    # 1-25
# NOTE(review): tokenise() never emits TOKEN_QNUM or TOKEN_MARKS -- bare
# integers become the literal kind "AMBIG" and are resolved by the parser.
TOKEN_TEXT = "TEXT"      # anything unrecognised (question body text)
def tokenise(lines: list[str]) -> list[tuple[str, str]]:
    """Convert raw text lines into a flat list of (kind, value) tokens.

    Blank lines, page-break markers and boilerplate header words are
    dropped entirely. Bare small integers are emitted as the special
    "AMBIG" kind (marks vs question number is context-dependent and is
    resolved later by the state machine). Everything unrecognised becomes
    a TEXT token carrying the stripped line.
    """
    out: list[tuple[str, str]] = []
    for raw in lines:
        stripped = clean(raw)
        # Discard empties, page-break markers and table-header boilerplate.
        if not stripped or IS_PAGE_BREAK.match(stripped) or is_skip(stripped):
            continue
        if IS_PART_HEADER.match(stripped):
            out.append((TOKEN_PART, stripped))
        elif IS_OR.match(stripped):
            out.append((TOKEN_OR, stripped))
        elif IS_BLOOMS.match(stripped):
            # Canonicalise spacing/case, then map "L3"/"BTL3" etc. to "CL3".
            normalised = stripped.upper().replace(" ", "")
            out.append((TOKEN_BLOOMS, BLOOMS_MAP.get(normalised.lower(), normalised)))
        elif IS_CO.match(stripped):
            out.append((TOKEN_CO, normalise_co(stripped)))
        elif IS_SUB_LABEL.match(stripped):
            out.append((TOKEN_SUB, stripped.lower()))
        elif IS_QUESTION_NUM.match(stripped):
            # Ambiguous -- could be marks or question number.
            # Emit a special AMBIG token and resolve in the state machine.
            out.append(("AMBIG", stripped))
        else:
            out.append((TOKEN_TEXT, stripped))
    return out
def parse_question_paper(text: str) -> dict[str, Any]:
    """Parse extracted question-paper text into structured parts.

    Pipeline: split *text* into lines -> tokenise() -> run a state machine
    over the tokens. Each "part" pairs two alternative questions (q1 OR q2)
    and accumulates their sub-questions. For every sub-question the expected
    token order is: SUB, TEXT*, marks (as an AMBIG integer 1-25), BLOOMS, CO.

    Returns a dict with keys "question_parts", "total_marks", "warnings".
    """
    lines = text.splitlines()
    tokens = tokenise(lines)
    warnings: list[str] = []
    parts: list[dict] = []
    part_number = 0                # bumped every time a new q1 number is seen
    current_q1: int | None = None  # question number before the OR
    current_q2: int | None = None  # question number after the OR
    after_or = False               # True once an OR separator has been seen
    # per-sub state
    sub_label = ""
    marks = 0
    blooms = ""
    co_number = ""
    in_sub = False
    # sequence tracker: after SUB we expect TEXT* MARKS BLOOMS CO
    saw_marks = False
    saw_blooms = False

    def get_or_create_part() -> dict:
        # Parts are created lazily, on the first commit_sub() for that part;
        # question numbers fall back to 1/2 if none were seen yet.
        for p in parts:
            if p["partNumber"] == part_number:
                return p
        p = {
            "partNumber": part_number,
            "question1Number": current_q1 or 1,
            "question2Number": current_q2 or (current_q1 + 1 if current_q1 else 2),
            "subQuestions": [],
        }
        parts.append(p)
        return p

    def commit_sub():
        # Flush the accumulated sub-question fields into the current part,
        # then clear the per-sub state ready for the next label.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        p = get_or_create_part()
        if after_or and current_q2:
            p["question2Number"] = current_q2
        p["subQuestions"].append({
            "label": sub_label,
            "marks": marks,
            "bloomsLevel": blooms,
            "co_number": co_number,
        })
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    def reset_sub():
        # Discard a partially-collected sub-question without recording it.
        nonlocal in_sub, sub_label, marks, blooms, co_number, saw_marks, saw_blooms
        in_sub = False
        sub_label = ""
        marks = 0
        blooms = ""
        co_number = ""
        saw_marks = False
        saw_blooms = False

    for kind, value in tokens:
        # -- PART / OR / QNUM bookkeeping (only when not mid-sub) -----------
        if kind == TOKEN_PART:
            reset_sub()
            after_or = False
            continue
        if kind == TOKEN_OR:
            if in_sub:
                reset_sub()
            after_or = True
            continue
        # AMBIG: resolve as QNUM only when we are NOT waiting for marks/blooms/CO
        if kind == "AMBIG":
            val = int(value)
            if in_sub and not saw_marks and 1 <= val <= 25:
                # treat as marks
                marks = val
                saw_marks = True
                continue
            elif not in_sub:
                # treat as question number
                if not after_or:
                    current_q1 = val
                    current_q2 = val + 1
                    part_number += 1
                else:
                    # NOTE(review): after_or only resets at the next PART
                    # header, so a later q1 number in the same part would
                    # also land here and overwrite q2 -- confirm intended.
                    current_q2 = val
                    for p in parts:
                        if p["partNumber"] == part_number:
                            p["question2Number"] = val
                continue
            # else it's noise (e.g. array values in question text) -- skip
            continue
        # -- Sub-question label ---------------------------------------------
        if kind == TOKEN_SUB:
            if in_sub and saw_marks and saw_blooms and co_number:
                commit_sub()
            elif in_sub:
                # previous sub was incomplete: drop it rather than emit junk
                reset_sub()
            sub_label = value
            in_sub = True
            continue
        # -- Within a sub-question ------------------------------------------
        if in_sub:
            if kind == TOKEN_BLOOMS and saw_marks:
                blooms = value
                saw_blooms = True
                continue
            if kind == TOKEN_CO and saw_blooms:
                co_number = value
                commit_sub()
                continue
            # TEXT is question body -- ignore for extraction
            if kind == TOKEN_TEXT:
                continue
        # Anything else outside a sub -- ignore
        continue
    # flush a trailing sub-question that at least collected its marks
    if in_sub and marks > 0:
        commit_sub()
    # -- Warnings -------------------------------------------------------------
    if not parts:
        warnings.append("No sub-questions detected. The paper layout may be unusual.")
    else:
        missing_co = sum(1 for p in parts for sq in p["subQuestions"] if not sq["co_number"])
        missing_bl = sum(1 for p in parts for sq in p["subQuestions"] if not sq["bloomsLevel"])
        if missing_co:
            warnings.append(f"{missing_co} sub-question(s) have no CO detected β please fill manually.")
        if missing_bl:
            warnings.append(f"{missing_bl} sub-question(s) have no Bloom's level detected β please fill manually.")
    total_marks = sum(sq["marks"] for p in parts for sq in p["subQuestions"])
    return {
        "question_parts": parts,
        "total_marks": total_marks,
        "warnings": warnings,
    }