bhuvan-2005 committed on
Commit
d0aa538
·
verified ·
1 Parent(s): ec1c3fd

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +91 -65
question_extractor.py CHANGED
@@ -85,13 +85,16 @@ def extract_text_from_image(image_path):
85
 
86
 
87
  def extract_subject_name(text):
88
- """
89
- Extract the subject name from the OCR text.
90
-
91
- This version is subject-agnostic: it does **not** hard-code any
92
- specific course name. It tries to infer the subject from common
93
- headers like "Course Title" or "Subject" and otherwise returns
94
- "Unknown Subject".
 
 
 
95
  """
96
  # First try a simple global search for a "Course:" style pattern
97
  m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
@@ -103,9 +106,49 @@ def extract_subject_name(text):
103
  # Normalise line endings
104
  lines = text.split('\\n')
105
 
106
- # First, look for explicit course/subject headers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  header_patterns = [
108
- r'Course\\s*Code\\s*&\\s*Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
109
  r'Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
110
  r'Subject\\s*[:\\-]?\\s*(.+)$',
111
  r'Paper\\s*Title\\s*[:\\-]?\\s*(.+)$',
@@ -113,22 +156,48 @@ def extract_subject_name(text):
113
  r'.*Course\\s*[:\\-]\\s*(.+)$',
114
  ]
115
 
116
- for line in lines:
117
- clean_line = re.sub(r'\s+', ' ', line).strip()
 
118
  if not clean_line:
119
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  for pattern in header_patterns:
121
  m = re.search(pattern, clean_line, re.IGNORECASE)
122
  if m:
123
  subject = m.group(1).strip()
124
  # Remove obvious trailing columns (like Semester, Class No, etc.)
125
- subject = re.split(r'\s{2,}|\s{1,}\|', subject)[0].strip()
 
 
 
 
126
  subject = re.sub(r'[|].*', '', subject).strip()
127
- return subject
 
128
 
129
- # Fallback: look for a line that looks like a course title (contains
130
  # words like Fundamentals, Mathematics, Engineering, etc.)
131
- keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry']
132
  for line in lines:
133
  lower = line.lower()
134
  if any(k in lower for k in keywords):
@@ -598,60 +667,15 @@ def process_question_paper(image_path, output_path):
598
  # Use text-line based generic extraction as the primary method.
599
  questions = extract_questions_from_text(text)
600
 
601
- # IoT-specific repair: only for the two known IoT sample images.
602
- # We detect them by filename so that other subjects stay generic.
603
- text_lower = text.lower()
604
- img_name = os.path.basename(image_path).lower()
605
- is_known_iot_paper = img_name.startswith('whatsapp image 2025-11-15 at 4.20.18 pm')
606
-
607
- if is_known_iot_paper:
608
- print("Using IoT-specific fallback extraction method...")
609
- fallback_questions = []
610
-
611
- is_first_page = any(keyword in text_lower for keyword in ['city council', 'connected cars', 'smart agriculture', 'startup'])
612
- is_second_page = any(keyword in text_lower for keyword in ['smart camera', 'gateway', '192.168'])
613
-
614
- if is_first_page:
615
- fallback_questions.extend([
616
- {
617
- 'number': '1',
618
- 'question': 'A city council is considering the deployment of a smart traffic management system that uses IoT-enabled traffic lights, connected CCTV cameras, and vehicle sensors to reduce congestion and improve emergency response times. The system will rely on a central control platform to process data in real time and dynamically adjust traffic flows. As part of the evaluation team, you are tasked with preparing an assessment that highlights the cost implications of implementing such a system, including both the resources needed for deployment and the potential benefits it could bring to the city in the long run. i) Identify various components involved in the cost evaluation. (4 marks) ii) Describe how each of these components would influence both the short-term expenditure and the long-term value of the project. (6 marks)',
619
- 'marks': '10'
620
- },
621
- {
622
- 'number': '2',
623
- 'question': 'Consider the case of connected cars and smart meters deployed in an industry. Compare and contrast these two cases in terms of their primary purpose, goals, and challenges. Explain how the focus of IoT deployment differs between a consumer-oriented system like connected cars and an infrastructure-oriented system like smart meters.',
624
- 'marks': '10'
625
- },
626
- {
627
- 'number': '3',
628
- 'question': 'A smart agriculture startup is developing an IoT-based prototype to monitor soil moisture, track weather conditions, and automate irrigation scheduling. Describe how the different stages of prototype development can be implemented for this system, starting from the initial concept to testing and refinement. In your answer, explain each stage in detail, including: how the problem is defined and requirements are gathered, the design and system architecture, the development of the prototype using IoT sensors, controllers, and cloud platforms, the testing strategies used to validate accuracy and reliability in field conditions, and the refinement process to improve performance, reduce costs, and ensure usability for farmers.',
629
- 'marks': '10'
630
- },
631
- ])
632
-
633
- if is_second_page:
634
- fallback_questions.extend([
635
- {
636
- 'number': '4',
637
- 'question': 'A smart camera system is deployed in public and private spaces to capture video streams for monitoring, surveillance, and automation purposes. Such systems handle sensitive personal data that could potentially affect user privacy and security. i) Identify the major data privacy and protection challenges involved in this IoT system, and explain how regulatory frameworks, and international standards can be applied to ensure lawful data collection, storage, processing, and deletion. (5 marks) ii) Illustrate your answer with specific examples of compliance measures that a smart camera system must adopt. (3 marks) iii) Discuss the regulatory implications if video data captured by the smart camera system is stored or processed in a different country. How should the system ensure compliance with international data transfer laws? (2 marks)',
638
- 'marks': '10'
639
- },
640
- {
641
- 'number': '5',
642
- 'question': "A manufacturer's smart-light gateway exposes a local web management API at http://192.168.0.10:8080. A malicious website persuades a user to visit it from the same LAN. The webpage repeatedly resolves its domain to different IPs and then attempts to send HTTP requests to http://192.168.0.10:8080 from the visitor's browser. Explain how this sequence of events could allow the remote webpage to interact with the gateway's local API, what makes the gateway vulnerable, and propose three practical mitigations at the device, browser, and network levels.",
643
- 'marks': '10'
644
- },
645
- ])
646
-
647
- questions = sorted(fallback_questions, key=lambda x: int(x['number']))
648
-
649
  # Write out the results
 
650
  with open(output_path, 'w', encoding='utf-8') as f:
651
  f.write(f"Subject: {subject}\\n\\n")
 
652
  f.write("QUESTIONS\\n\\n")
653
  for q in questions:
654
- f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\\n\\n")
 
655
 
656
  print(f"Extracted content saved to: {output_path}")
657
  return subject, questions
@@ -710,9 +734,11 @@ def process_pdf_question_paper(pdf_path, output_path):
710
  # Write combined results for the whole PDF
711
  with open(output_path, 'w', encoding='utf-8') as f:
712
  f.write(f"Subject: {subject}\n\n")
 
713
  f.write("QUESTIONS\n\n")
714
  for q in all_questions:
715
- f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\n\n")
 
716
 
717
  print(f"Extracted content saved to: {output_path}")
718
  return subject, all_questions
 
85
 
86
 
87
  def extract_subject_name(text):
88
+ """Infer the subject name from OCR text.
89
+
90
+ Strategy (in order):
91
+ - Look for rich "Course Code & Course Title" style headers and try to
92
+ reconstruct the full subject (e.g. "Network Security and
93
+ Cryptography Fundamentals").
94
+ - Look for "Course Title" / "Subject" style lines, allowing for
95
+ common OCR corruptions like "ourse Title".
96
+ - As a final fallback, pick any line that looks like a course title
97
+ based on keywords.
98
  """
99
  # First try a simple global search for a "Course:" style pattern
100
  m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
 
106
  # Normalise line endings
107
  lines = text.split('\\n')
108
 
109
+ # 1) Special handling for lines that contain both "Course Code" and
110
+ # "Course Title" – these often embed both the subject and the code on
111
+ # a single noisy line.
112
+ for line in lines:
113
+ raw = re.sub(r'\s+', ' ', line).strip()
114
+ if not raw:
115
+ continue
116
+ lower = raw.lower()
117
+ if 'course code' in lower and 'course title' in lower:
118
+ # Try to capture patterns like:
119
+ # "Course Code & CSE1029-Network Security and Course Title
120
+ # Cryptography Fundamentals Faculty : ..."
121
+ m = re.search(
122
+ r'Course\s*Code[^A-Za-z0-9]+(?P<code>[A-Za-z0-9]+)\s*[-:]?\s*(?P<part1>[^:]*?)\s*(?:and\s+Course\s*Title\s+(?P<part2>[^:]+))?',
123
+ raw,
124
+ re.IGNORECASE,
125
+ )
126
+ if m:
127
+ part1 = (m.group('part1') or '').strip()
128
+ part2 = (m.group('part2') or '').strip()
129
+ subject_parts = []
130
+ if part1:
131
+ subject_parts.append(part1)
132
+ if part2:
133
+ subject_parts.append(part2)
134
+ if subject_parts:
135
+ subject = ' and '.join(subject_parts)
136
+ else:
137
+ # Fallback: take everything after "Course Title"
138
+ idx = lower.find('course title')
139
+ subject = raw[idx + len('course title'):].strip()
140
+ # Cut off trailing metadata like Faculty/Answer all/etc.
141
+ subject = re.split(
142
+ r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
143
+ subject,
144
+ maxsplit=1,
145
+ )[0].strip()
146
+ subject = re.sub(r'[|].*', '', subject).strip()
147
+ if subject:
148
+ return re.sub(r'\s+', ' ', subject)
149
+
150
+ # 2) Generic course/subject header patterns
151
  header_patterns = [
 
152
  r'Course\\s*Title\\s*[:\\-]?\\s*(.+)$',
153
  r'Subject\\s*[:\\-]?\\s*(.+)$',
154
  r'Paper\\s*Title\\s*[:\\-]?\\s*(.+)$',
 
156
  r'.*Course\\s*[:\\-]\\s*(.+)$',
157
  ]
158
 
159
+ for i, line in enumerate(lines):
160
+ raw = line
161
+ clean_line = re.sub(r'\s+', ' ', raw).strip()
162
  if not clean_line:
163
  continue
164
+
165
+ # Allow for OCR-mangled "Course Title" such as "ourse Title".
166
+ lower = clean_line.lower()
167
+ if 'title' in lower and ('course' in lower or 'ourse' in lower):
168
+ idx = lower.find('title')
169
+ after = clean_line[idx + len('title'):].strip()
170
+ # Sometimes the actual title is on the next line; if the
171
+ # remainder is too short, append the next line.
172
+ if len(after) < 6 and i + 1 < len(lines):
173
+ after = (after + ' ' + re.sub(r'\s+', ' ', lines[i + 1]).strip()).strip()
174
+ subject = after
175
+ subject = re.split(
176
+ r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
177
+ subject,
178
+ maxsplit=1,
179
+ )[0].strip()
180
+ subject = re.sub(r'[|].*', '', subject).strip()
181
+ if subject:
182
+ return re.sub(r'\s+', ' ', subject)
183
+
184
  for pattern in header_patterns:
185
  m = re.search(pattern, clean_line, re.IGNORECASE)
186
  if m:
187
  subject = m.group(1).strip()
188
  # Remove obvious trailing columns (like Semester, Class No, etc.)
189
+ subject = re.split(
190
+ r'\b(Faculty|Answer all|Programme|Program|Time|Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b',
191
+ subject,
192
+ maxsplit=1,
193
+ )[0].strip()
194
  subject = re.sub(r'[|].*', '', subject).strip()
195
+ if subject:
196
+ return re.sub(r'\s+', ' ', subject)
197
 
198
+ # 3) Fallback: look for a line that looks like a course title (contains
199
  # words like Fundamentals, Mathematics, Engineering, etc.)
200
+ keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry', 'analytics', 'security']
201
  for line in lines:
202
  lower = line.lower()
203
  if any(k in lower for k in keywords):
 
667
  # Use text-line based generic extraction as the primary method.
668
  questions = extract_questions_from_text(text)
669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  # Write out the results
671
+ with open(output_path, 'w', encoding='utf-8') as f:
672
  with open(output_path, 'w', encoding='utf-8') as f:
673
  f.write(f"Subject: {subject}\\n\\n")
674
+ f.write(f"Total Questions: {len(questions)}\\n\\n")
675
  f.write("QUESTIONS\\n\\n")
676
  for q in questions:
677
+ f.write(f"Q{q['number']} ({q['marks']} marks):\\n")
678
+ f.write(f"{q['question']}\\n\\n")
679
 
680
  print(f"Extracted content saved to: {output_path}")
681
  return subject, questions
 
734
  # Write combined results for the whole PDF
735
  with open(output_path, 'w', encoding='utf-8') as f:
736
  f.write(f"Subject: {subject}\n\n")
737
+ f.write(f"Total Questions: {len(all_questions)}\n\n")
738
  f.write("QUESTIONS\n\n")
739
  for q in all_questions:
740
+ f.write(f"Q{q['number']} ({q['marks']} marks):\n")
741
+ f.write(f"{q['question']}\n\n")
742
 
743
  print(f"Extracted content saved to: {output_path}")
744
  return subject, all_questions