Spaces:

bhuvan-2005
/

question-extractor

Sleeping

App Files Community

bhuvan-2005 committed on Nov 17, 2025

Commit

d0aa538

verified ·

1 Parent(s): ec1c3fd

Update question_extractor.py

Browse files

Files changed (1) hide show

question_extractor.py +91 -65

question_extractor.py CHANGED Viewed

@@ -85,13 +85,16 @@ def extract_text_from_image(image_path):
 def extract_subject_name(text):
-    """
-    Extract the subject name from the OCR text.
-    This version is subject-agnostic: it does **not** hard-code any
-    specific course name. It tries to infer the subject from common
-    headers like "Course Title" or "Subject" and otherwise returns
-    "Unknown Subject".
     """
     # First try a simple global search for a "Course:" style pattern
     m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
@@ -103,9 +106,49 @@ def extract_subject_name(text):
     # Normalise line endings
     lines = text.split('\\n')
-    # First, look for explicit course/subject headers
     header_patterns = [
-        r'Course\\s*Code\\s*&\\s*Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
         r'Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
         r'Subject\\s*[:\\-]?\\s*(.+)$',
         r'Paper\\s*Title\\s*[:\\-]?\\s*(.+)$',
@@ -113,22 +156,48 @@ def extract_subject_name(text):
         r'.*Course\\s*[:\\-]\\s*(.+)$',
     ]
-    for line in lines:
-        clean_line = re.sub(r'\s+', ' ', line).strip()
         if not clean_line:
             continue
         for pattern in header_patterns:
             m = re.search(pattern, clean_line, re.IGNORECASE)
             if m:
                 subject = m.group(1).strip()
                 # Remove obvious trailing columns (like Semester, Class No, etc.)
-                subject = re.split(r'\s{2,}|\s{1,}\|', subject)[0].strip()
                 subject = re.sub(r'[|].*', '', subject).strip()
-                return subject
-    # Fallback: look for a line that looks like a course title (contains
     # words like Fundamentals, Mathematics, Engineering, etc.)
-    keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry']
     for line in lines:
         lower = line.lower()
         if any(k in lower for k in keywords):
@@ -598,60 +667,15 @@ def process_question_paper(image_path, output_path):
     # Use text-line based generic extraction as the primary method.
     questions = extract_questions_from_text(text)
-    # IoT-specific repair: only for the two known IoT sample images.
-    # We detect them by filename so that other subjects stay generic.
-    text_lower = text.lower()
-    img_name = os.path.basename(image_path).lower()
-    is_known_iot_paper = img_name.startswith('whatsapp image 2025-11-15 at 4.20.18 pm')
-    if is_known_iot_paper:
-        print("Using IoT-specific fallback extraction method...")
-        fallback_questions = []
-        is_first_page = any(keyword in text_lower for keyword in ['city council', 'connected cars', 'smart agriculture', 'startup'])
-        is_second_page = any(keyword in text_lower for keyword in ['smart camera', 'gateway', '192.168'])
-        if is_first_page:
-            fallback_questions.extend([
-                {
-                    'number': '1',
-                    'question': 'A city council is considering the deployment of a smart traffic management system that uses IoT-enabled traffic lights, connected CCTV cameras, and vehicle sensors to reduce congestion and improve emergency response times. The system will rely on a central control platform to process data in real time and dynamically adjust traffic flows. As part of the evaluation team, you are tasked with preparing an assessment that highlights the cost implications of implementing such a system, including both the resources needed for deployment and the potential benefits it could bring to the city in the long run. i) Identify various components involved in the cost evaluation. (4 marks) ii) Describe how each of these components would influence both the short-term expenditure and the long-term value of the project. (6 marks)',
-                    'marks': '10'
-                },
-                {
-                    'number': '2',
-                    'question': 'Consider the case of connected cars and smart meters deployed in an industry. Compare and contrast these two cases in terms of their primary purpose, goals, and challenges. Explain how the focus of IoT deployment differs between a consumer-oriented system like connected cars and an infrastructure-oriented system like smart meters.',
-                    'marks': '10'
-                },
-                {
-                    'number': '3',
-                    'question': 'A smart agriculture startup is developing an IoT-based prototype to monitor soil moisture, track weather conditions, and automate irrigation scheduling. Describe how the different stages of prototype development can be implemented for this system, starting from the initial concept to testing and refinement. In your answer, explain each stage in detail, including: how the problem is defined and requirements are gathered, the design and system architecture, the development of the prototype using IoT sensors, controllers, and cloud platforms, the testing strategies used to validate accuracy and reliability in field conditions, and the refinement process to improve performance, reduce costs, and ensure usability for farmers.',
-                    'marks': '10'
-                },
-            ])
-        if is_second_page:
-            fallback_questions.extend([
-                {
-                    'number': '4',
-                    'question': 'A smart camera system is deployed in public and private spaces to capture video streams for monitoring, surveillance, and automation purposes. Such systems handle sensitive personal data that could potentially affect user privacy and security. i) Identify the major data privacy and protection challenges involved in this IoT system, and explain how regulatory frameworks, and international standards can be applied to ensure lawful data collection, storage, processing, and deletion. (5 marks) ii) Illustrate your answer with specific examples of compliance measures that a smart camera system must adopt. (3 marks) iii) Discuss the regulatory implications if video data captured by the smart camera system is stored or processed in a different country. How should the system ensure compliance with international data transfer laws? (2 marks)',
-                    'marks': '10'
-                },
-                {
-                    'number': '5',
-                    'question': "A manufacturer's smart-light gateway exposes a local web management API at http://192.168.0.10:8080. A malicious website persuades a user to visit it from the same LAN. The webpage repeatedly resolves its domain to different IPs and then attempts to send HTTP requests to http://192.168.0.10:8080 from the visitor's browser. Explain how this sequence of events could allow the remote webpage to interact with the gateway's local API, what makes the gateway vulnerable, and propose three practical mitigations at the device, browser, and network levels.",
-                    'marks': '10'
-                },
-            ])
-        questions = sorted(fallback_questions, key=lambda x: int(x['number']))
     # Write out the results
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\\n\\n")
         f.write("QUESTIONS\\n\\n")
         for q in questions:
-            f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\\n\\n")
     print(f"Extracted content saved to: {output_path}")
     return subject, questions
@@ -710,9 +734,11 @@ def process_pdf_question_paper(pdf_path, output_path):
     # Write combined results for the whole PDF
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\n\n")
         f.write("QUESTIONS\n\n")
         for q in all_questions:
-            f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\n\n")
     print(f"Extracted content saved to: {output_path}")
     return subject, all_questions

 def extract_subject_name(text):
+    """Infer the subject name from OCR text.
+    Strategy (in order):
+    - Look for rich "Course Code & Course Title" style headers and try to
+      reconstruct the full subject (e.g. "Network Security and
+      Cryptography Fundamentals").
+    - Look for "Course Title" / "Subject" style lines, allowing for
+      common OCR corruptions like "ourse Title".
+    - As a final fallback, pick any line that looks like a course title
+      based on keywords.
     """
     # First try a simple global search for a "Course:" style pattern
     m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
     # Normalise line endings
     lines = text.split('\\n')
+    # 1) Special handling for lines that contain both "Course Code" and
+    # "Course Title" – these often embed both the subject and the code on
+    # a single noisy line.
+    for line in lines:
+        raw = re.sub(r'\s+', ' ', line).strip()
+        if not raw:
+            continue
+        lower = raw.lower()
+        if 'course code' in lower and 'course title' in lower:
+            # Try to capture patterns like:
+            #   "Course Code & CSE1029-Network Security and Course Title
+            #    Cryptography Fundamentals Faculty : ..."
+            m = re.search(
+                r'Course\s*Code[^A-Za-z0-9]+(?P<code>[A-Za-z0-9]+)\s*[-:]?\s*(?P<part1>[^:]*?)\s*(?:and\s+Course\s*Title\s+(?P<part2>[^:]+))?',
+                raw,
+                re.IGNORECASE,
+            )
+            if m:
+                part1 = (m.group('part1') or '').strip()
+                part2 = (m.group('part2') or '').strip()
+                subject_parts = []
+                if part1:
+                    subject_parts.append(part1)
+                if part2:
+                    subject_parts.append(part2)
+                if subject_parts:
+                    subject = ' and '.join(subject_parts)
+                else:
+                    # Fallback: take everything after "Course Title"
+                    idx = lower.find('course title')
+                    subject = raw[idx + len('course title'):].strip()
+                # Cut off trailing metadata like Faculty/Answer all/etc.
+                subject = re.split(
+                    r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
+                    subject,
+                    maxsplit=1,
+                )[0].strip()
+                subject = re.sub(r'[|].*', '', subject).strip()
+                if subject:
+                    return re.sub(r'\s+', ' ', subject)
+    # 2) Generic course/subject header patterns
     header_patterns = [
         r'Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
         r'Subject\\s*[:\\-]?\\s*(.+)$',
         r'Paper\\s*Title\\s*[:\\-]?\\s*(.+)$',
         r'.*Course\\s*[:\\-]\\s*(.+)$',
     ]
+    for i, line in enumerate(lines):
+        raw = line
+        clean_line = re.sub(r'\s+', ' ', raw).strip()
         if not clean_line:
             continue
+        # Allow for OCR-mangled "Course Title" such as "ourse Title".
+        lower = clean_line.lower()
+        if 'title' in lower and ('course' in lower or 'ourse' in lower):
+            idx = lower.find('title')
+            after = clean_line[idx + len('title'):].strip()
+            # Sometimes the actual title is on the next line; if the
+            # remainder is too short, append the next line.
+            if len(after) < 6 and i + 1 < len(lines):
+                after = (after + ' ' + re.sub(r'\s+', ' ', lines[i + 1]).strip()).strip()
+            subject = after
+            subject = re.split(
+                r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
+                subject,
+                maxsplit=1,
+            )[0].strip()
+            subject = re.sub(r'[|].*', '', subject).strip()
+            if subject:
+                return re.sub(r'\s+', ' ', subject)
         for pattern in header_patterns:
             m = re.search(pattern, clean_line, re.IGNORECASE)
             if m:
                 subject = m.group(1).strip()
                 # Remove obvious trailing columns (like Semester, Class No, etc.)
+                subject = re.split(
+                    r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
+                    subject,
+                    maxsplit=1,
+                )[0].strip()
                 subject = re.sub(r'[|].*', '', subject).strip()
+                if subject:
+                    return re.sub(r'\s+', ' ', subject)
+    # 3) Fallback: look for a line that looks like a course title (contains
     # words like Fundamentals, Mathematics, Engineering, etc.)
+    keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
     for line in lines:
         lower = line.lower()
         if any(k in lower for k in keywords):
     # Use text-line based generic extraction as the primary method.
     questions = extract_questions_from_text(text)
     # Write out the results
+    with open(output_path, 'w', encoding='utf-8') as f:
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\\n\\n")
+        f.write(f"Total Questions: {len(questions)}\\n\\n")
         f.write("QUESTIONS\\n\\n")
         for q in questions:
+            f.write(f"Q{q['number']} ({q['marks']} marks):\\n")
+            f.write(f"{q['question']}\\n\\n")
     print(f"Extracted content saved to: {output_path}")
     return subject, questions
     # Write combined results for the whole PDF
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\n\n")
+        f.write(f"Total Questions: {len(all_questions)}\n\n")
         f.write("QUESTIONS\n\n")
         for q in all_questions:
+            f.write(f"Q{q['number']} ({q['marks']} marks):\n")
+            f.write(f"{q['question']}\n\n")
     print(f"Extracted content saved to: {output_path}")
     return subject, all_questions