bhuvan-2005 committed on
Commit
1a54ec7
·
verified ·
1 Parent(s): 5190f7f

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +19 -7
question_extractor.py CHANGED
@@ -196,14 +196,26 @@ def extract_subject_name(text):
196
  return re.sub(r'\s+', ' ', subject)
197
 
198
  # 3) Fallback: look for a line that looks like a course title (contains
199
- # words like Fundamentals, Mathematics, Engineering, etc.)
 
 
 
 
 
 
 
 
 
 
 
200
  keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
201
- for line in lines:
202
- lower = line.lower()
203
- if any(k in lower for k in keywords):
204
- candidate = re.sub(r'[|].*', '', line).strip()
205
- if candidate:
206
- return re.sub(r'\s+', ' ', candidate)
 
207
 
208
  return "Unknown Subject"
209
 
 
196
  return re.sub(r'\s+', ' ', subject)
197
 
198
  # 3) Fallback: look for a line that looks like a course title (contains
199
+ # words like Fundamentals, Mathematics, Engineering, etc.). To avoid
200
+ # mislabelling mid-page question text (e.g. when we only see the
201
+ # backside/table like qp003), only enable this fallback if we have
202
+ # already seen some evidence of a proper header (Programme, Course
203
+ # Code, etc.) elsewhere in the page.
204
+ header_hint_tokens = [
205
+ 'programme', 'program', 'course code', 'course title', 'subject',
206
+ 'paper title', 'assessment test', 'continuous assessment', 'cat',
207
+ 'max. mark', 'semester', 'slot'
208
+ ]
209
+ has_header_hints = any(tok in text.lower() for tok in header_hint_tokens)
210
+
211
  keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
212
+ if has_header_hints:
213
+ for line in lines:
214
+ lower = line.lower()
215
+ if any(k in lower for k in keywords):
216
+ candidate = re.sub(r'[|].*', '', line).strip()
217
+ if candidate:
218
+ return re.sub(r'\s+', ' ', candidate)
219
 
220
  return "Unknown Subject"
221