Spaces:
Sleeping
Sleeping
Update question_extractor.py
Browse files- question_extractor.py +19 -7
question_extractor.py
CHANGED
|
@@ -196,14 +196,26 @@ def extract_subject_name(text):
|
|
| 196 |
return re.sub(r'\s+', ' ', subject)
|
| 197 |
|
| 198 |
# 3) Fallback: look for a line that looks like a course title (contains
|
| 199 |
-
# words like Fundamentals, Mathematics, Engineering, etc.)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
| 207 |
|
| 208 |
return "Unknown Subject"
|
| 209 |
|
|
|
|
| 196 |
return re.sub(r'\s+', ' ', subject)
|
| 197 |
|
| 198 |
# 3) Fallback: look for a line that looks like a course title (contains
|
| 199 |
+
# words like Fundamentals, Mathematics, Engineering, etc.). To avoid
|
| 200 |
+
# mislabelling mid-page question text (e.g. when we only see the
|
| 201 |
+
# backside/table like qp003), only enable this fallback if we have
|
| 202 |
+
# already seen some evidence of a proper header (Programme, Course
|
| 203 |
+
# Code, etc.) elsewhere in the page.
|
| 204 |
+
header_hint_tokens = [
|
| 205 |
+
'programme', 'program', 'course code', 'course title', 'subject',
|
| 206 |
+
'paper title', 'assessment test', 'continuous assessment', 'cat',
|
| 207 |
+
'max. mark', 'semester', 'slot'
|
| 208 |
+
]
|
| 209 |
+
has_header_hints = any(tok in text.lower() for tok in header_hint_tokens)
|
| 210 |
+
|
| 211 |
keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
|
| 212 |
+
if has_header_hints:
|
| 213 |
+
for line in lines:
|
| 214 |
+
lower = line.lower()
|
| 215 |
+
if any(k in lower for k in keywords):
|
| 216 |
+
candidate = re.sub(r'[|].*', '', line).strip()
|
| 217 |
+
if candidate:
|
| 218 |
+
return re.sub(r'\s+', ' ', candidate)
|
| 219 |
|
| 220 |
return "Unknown Subject"
|
| 221 |
|