Spaces:

bhuvan-2005
/

question-extractor

Sleeping

bhuvan-2005 committed on Nov 17, 2025

Commit

1a54ec7

verified ·

1 Parent(s): 5190f7f

Update question_extractor.py

Files changed (1) hide show

question_extractor.py CHANGED Viewed

@@ -196,14 +196,26 @@ def extract_subject_name(text):
                     return re.sub(r'\s+', ' ', subject)
     # 3) Fallback: look for a line that looks like a course title (contains
-    # words like Fundamentals, Mathematics, Engineering, etc.)
     keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
-    for line in lines:
-        lower = line.lower()
-        if any(k in lower for k in keywords):
-            candidate = re.sub(r'[|].*', '', line).strip()
-            if candidate:
-                return re.sub(r'\s+', ' ', candidate)
     return "Unknown Subject"

                     return re.sub(r'\s+', ' ', subject)
     # 3) Fallback: look for a line that looks like a course title (contains
+    # words like Fundamentals, Mathematics, Engineering, etc.). To avoid
+    # mislabelling mid-page question text (e.g. when we only see the
+    # backside/table like qp003), only enable this fallback if we have
+    # already seen some evidence of a proper header (Programme, Course
+    # Code, etc.) elsewhere in the page.
+    header_hint_tokens = [
+        'programme', 'program', 'course code', 'course title', 'subject',
+        'paper title', 'assessment test', 'continuous assessment', 'cat',
+        'max. mark', 'semester', 'slot'
+    ]
+    has_header_hints = any(tok in text.lower() for tok in header_hint_tokens)
     keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
+    if has_header_hints:
+        for line in lines:
+            lower = line.lower()
+            if any(k in lower for k in keywords):
+                candidate = re.sub(r'[|].*', '', line).strip()
+                if candidate:
+                    return re.sub(r'\s+', ' ', candidate)
     return "Unknown Subject"