Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Question Paper Extractor | |
| Extracts subject name and questions with marks from question paper images | |
| """ | |
| import os | |
| import re | |
| import sys | |
| import difflib | |
| import tempfile | |
| import pytesseract | |
| from pytesseract import Output | |
| from PIL import Image | |
| import cv2 | |
| import numpy as np | |
# Allow large images from high-resolution PDFs (e.g. qp002) without
# triggering Pillow's decompression bomb protection. The DPI we use is
# modest, but some pages still exceed the default pixel limit.
Image.MAX_IMAGE_PIXELS = None

# Optional PDF support: pdf2image also needs the poppler binaries at
# runtime, so it may be absent in some deployments. We degrade gracefully
# by leaving the converter as None and checking for it at call time
# (see process_pdf_question_paper).
try:
    from pdf2image import convert_from_path  # type: ignore
except Exception:
    convert_from_path = None
def preprocess_image(image_path):
    """Preprocess an image for better OCR results.

    Pipeline: grayscale -> adaptive threshold -> non-local-means denoise ->
    morphological close. The result is written to a unique temporary PNG
    whose path is returned; the caller is responsible for deleting it.

    Parameters
    ----------
    image_path : str
        Path to the input image.

    Returns
    -------
    str
        Path of the preprocessed temporary PNG file.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read (previously this crashed later inside
        cv2.cvtColor with a cryptic error).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Adaptive threshold copes better with uneven lighting than a global one.
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # Denoise the image
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)
    # NOTE(review): a 1x1 close is effectively a no-op; kept for parity with
    # the original tuning — larger kernels may blur thin glyph strokes.
    kernel = np.ones((1, 1), np.uint8)
    denoised = cv2.morphologyEx(denoised, cv2.MORPH_CLOSE, kernel)
    # Write to a unique temp file instead of a fixed name in the CWD so
    # concurrent runs cannot clobber each other's output.
    fd, temp_path = tempfile.mkstemp(suffix=".png", prefix="qp_pre_")
    os.close(fd)
    cv2.imwrite(temp_path, denoised)
    return temp_path
def extract_text_from_image(image_path):
    """Run OCR on an image and return the raw text.

    The image is upscaled (more aggressively when small) and converted to
    grayscale before being handed to Tesseract with PSM 6 — a combination
    found empirically to work well for the VIT question paper scans in
    this project.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    height, width = img.shape[:2]
    # Small scans benefit from a larger upscale; bigger pages only get a
    # mild one to keep CPU cost reasonable.
    scale = 1.8 if max(height, width) < 1500 else 1.2
    enlarged = cv2.resize(
        img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
    )
    grayscale = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config=r'--oem 3 --psm 6')
def extract_subject_name(text):
    """Infer the subject name from OCR text.

    Strategy (in order):
    - A global "Course: <title>" style pattern.
    - Rich "Course Code & Course Title" headers, reconstructing the full
      subject (e.g. "Network Security and Cryptography Fundamentals").
    - "Course Title" / "Subject" style lines, allowing for common OCR
      corruptions like "ourse Title".
    - As a final fallback, any line that looks like a course title based
      on keywords — but only when other header evidence exists on the page.

    Returns "Unknown Subject" when nothing matches.
    """
    # Regex that cuts off trailing header metadata (Faculty, timing, etc.).
    # Defined once; it was previously duplicated inline three times.
    trailing_meta = (
        r'\b(Faculty|Answer all|Programme|Program|Time|'
        r'Max\.\s*Marks?|Class\s+No\.?|Class\s+Nor)\b'
    )
    # First try a simple global search for a "Course:" style pattern.
    m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
    if m:
        subject = m.group(1).strip()
        # Drop anything after a table pipe (OCR table noise).
        subject = re.sub(r'[|].*', '', subject).strip()
        return re.sub(r'\s+', ' ', subject)
    # Split into lines. BUG FIX: this previously split on the two-character
    # sequence backslash+n ('\\n'), which left the whole text as a single
    # "line" and defeated every per-line heuristic below.
    lines = text.split('\n')
    # 1) Special handling for lines that contain both "Course Code" and
    #    "Course Title" — these often embed both the subject and the code
    #    on a single noisy line.
    for line in lines:
        raw = re.sub(r'\s+', ' ', line).strip()
        if not raw:
            continue
        lower = raw.lower()
        if 'course code' in lower and 'course title' in lower:
            # Capture patterns like:
            #   "Course Code & CSE1029-Network Security and Course Title
            #    Cryptography Fundamentals Faculty : ..."
            m = re.search(
                r'Course\s*Code[^A-Za-z0-9]+(?P<code>[A-Za-z0-9]+)\s*[-:]?\s*'
                r'(?P<part1>[^:]*?)\s*(?:and\s+Course\s*Title\s+(?P<part2>[^:]+))?',
                raw,
                re.IGNORECASE,
            )
            if m:
                part1 = (m.group('part1') or '').strip()
                part2 = (m.group('part2') or '').strip()
                subject_parts = [p for p in (part1, part2) if p]
                if subject_parts:
                    subject = ' and '.join(subject_parts)
                else:
                    # Fallback: take everything after "Course Title".
                    idx = lower.find('course title')
                    subject = raw[idx + len('course title'):].strip()
                subject = re.split(trailing_meta, subject, maxsplit=1)[0].strip()
                subject = re.sub(r'[|].*', '', subject).strip()
                if subject:
                    return re.sub(r'\s+', ' ', subject)
    # 2) Generic course/subject header patterns. BUG FIX: these raw strings
    #    previously contained doubled backslashes ('\\s'), i.e. a literal
    #    backslash followed by 's', so they could never match real headers.
    header_patterns = [
        r'Course\s*Title\s*[:\-]?\s*(.+)$',
        r'Subject\s*[:\-]?\s*(.+)$',
        r'Paper\s*Title\s*[:\-]?\s*(.+)$',
        # More generic: any line containing "Course:" where the rest looks
        # like a title.
        r'.*Course\s*[:\-]\s*(.+)$',
    ]
    for i, line in enumerate(lines):
        clean_line = re.sub(r'\s+', ' ', line).strip()
        if not clean_line:
            continue
        # Allow for OCR-mangled "Course Title" such as "ourse Title".
        lower = clean_line.lower()
        if 'title' in lower and ('course' in lower or 'ourse' in lower):
            idx = lower.find('title')
            after = clean_line[idx + len('title'):].strip()
            # BUG FIX: drop the header's ':' / '-' separator so it does not
            # end up as part of the returned subject name.
            after = after.lstrip(':- ').strip()
            # Sometimes the actual title is on the next line; if the
            # remainder is too short, append the next line.
            if len(after) < 6 and i + 1 < len(lines):
                after = (after + ' ' + re.sub(r'\s+', ' ', lines[i + 1]).strip()).strip()
            subject = re.split(trailing_meta, after, maxsplit=1)[0].strip()
            subject = re.sub(r'[|].*', '', subject).strip()
            if subject:
                return re.sub(r'\s+', ' ', subject)
        for pattern in header_patterns:
            m = re.search(pattern, clean_line, re.IGNORECASE)
            if m:
                subject = m.group(1).strip()
                # Remove obvious trailing columns (Semester, Class No, ...).
                subject = re.split(trailing_meta, subject, maxsplit=1)[0].strip()
                subject = re.sub(r'[|].*', '', subject).strip()
                if subject:
                    return re.sub(r'\s+', ' ', subject)
    # 3) Fallback: look for a line that resembles a course title (contains
    #    words like Fundamentals, Mathematics, Engineering, ...). To avoid
    #    mislabelling mid-page question text (e.g. when we only see the
    #    backside/table like qp003), only enable this if the page shows
    #    some evidence of a proper header elsewhere.
    header_hint_tokens = [
        'programme', 'program', 'course code', 'course title', 'subject',
        'paper title', 'assessment test', 'continuous assessment', 'cat',
        'max. mark', 'semester', 'slot'
    ]
    has_header_hints = any(tok in text.lower() for tok in header_hint_tokens)
    keywords = ['fundamentals', 'mathematics', 'engineering', 'physics',
                'chemistry', 'analytics', 'security']
    if has_header_hints:
        for line in lines:
            lower = line.lower()
            if any(k in lower for k in keywords):
                candidate = re.sub(r'[|].*', '', line).strip()
                if candidate:
                    return re.sub(r'\s+', ' ', candidate)
    return "Unknown Subject"
| def _line_looks_like_question_start(text: str) -> bool: | |
| """Heuristic: does a line look like the start of a question? | |
| We look for either a scenario-style opener ("You are...", "Assume...", | |
| etc.) or an imperative verb at the beginning (after stripping bullets | |
| and quotes). Uses fuzzy matching to cope with OCR noise. | |
| """ | |
| if not text: | |
| return False | |
| # Strip leading non-letters (quotes, bullets, numbers, table pipes) | |
| s = re.sub(r'^[^A-Za-z]+', '', text).strip() | |
| if not s: | |
| return False | |
| lower_s = s.lower() | |
| # Scenario-style openers that typically mark the start of a main | |
| # question in these papers. | |
| if lower_s.startswith(("you ", "assume ", "consider ", "suppose ")): | |
| return True | |
| first = s.split()[0].lower() | |
| verbs = [ | |
| 'do', 'perform', 'design', 'explain', 'describe', 'compute', 'calculate', | |
| 'discuss', 'analyse', 'analyze', 'derive', 'prove', 'show', 'find', | |
| 'state', 'write', 'construct', 'draw', 'implement', 'develop', 'evaluate', | |
| 'justify', 'compare', 'contrast', 'discuss', 'outline', 'define', | |
| ] | |
| if first in verbs: | |
| return True | |
| if len(first) < 2: | |
| return False | |
| # Fuzzy match to handle common OCR misspellings (e.g. "Disuss" for | |
| # "Discuss") but avoid long non-verb words like "relationship" being | |
| # treated as verbs. A relatively high cutoff keeps this conservative. | |
| close = difflib.get_close_matches(first, verbs, n=1, cutoff=0.8) | |
| return bool(close) | |
def extract_questions_with_layout(image_path):
    """Extract questions using spatial layout (question numbers in left column).

    This uses Tesseract's image_to_data to look for digit tokens near the
    left margin (question numbers) and groups the following lines as the
    question body until the next number. When fewer than three question
    numbers are detected, it falls back to segmenting the page body by
    vertical whitespace gaps.

    Returns a list of dicts with 'number', 'question' and 'marks' keys
    (marks default to "10" when not found in the text), or an empty list
    if the image cannot be read or contains no body text.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            return []
    except Exception:
        return []
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Simple global threshold; positional data is robust to mild noise here.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    data = pytesseract.image_to_data(thresh, output_type=Output.DICT)
    width = img.shape[1]
    height = img.shape[0]
    # Group OCR word tokens into visual lines keyed by Tesseract's
    # (block, paragraph, line) identifiers.
    lines_map = {}
    for i, text in enumerate(data["text"]):
        t = text.strip()
        if not t:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        lines_map.setdefault(key, []).append({
            "text": t,
            "left": data["left"][i],
            "top": data["top"][i],
        })
    # Flatten each grouped line into text plus its bounding position.
    lines = []
    for key, tokens in lines_map.items():
        tokens_sorted = sorted(tokens, key=lambda t: t["left"])
        line_text = " ".join(t["text"] for t in tokens_sorted)
        top = min(t["top"] for t in tokens_sorted)
        left = min(t["left"] for t in tokens_sorted)
        lines.append({"tokens": tokens_sorted, "text": line_text, "top": top, "left": left})
    # Sort by vertical position (reading order).
    lines.sort(key=lambda l: l["top"])
    # Heuristic: ignore header area (top 30% of the page).
    header_cutoff = int(height * 0.3)
    # Find candidate question-number lines.
    raw_q_indices = []
    raw_q_numbers = []
    for idx, line in enumerate(lines):
        if line["top"] < header_cutoff:
            continue
        text_lower = line["text"].lower()
        # Skip obvious table header rows for VIT-style papers.
        if "q. no" in text_lower or "q no" in text_lower:
            continue
        if "description" in text_lower and "marks" in text_lower:
            continue
        # Consider only the first alphanumeric token on the line to avoid
        # picking up numbers that appear in the middle of sentences.
        first_tok = None
        for tok in line["tokens"]:
            t = tok["text"]
            if not t:
                continue
            if not any(ch.isalnum() for ch in t):
                continue
            first_tok = tok
            break
        # Require a pure integer token in a reasonable range, in the
        # left-most part of the page (Q.No column in VIT tables).
        if first_tok and first_tok["text"].isdigit():
            n = int(first_tok["text"])
            if 1 <= n <= 50 and first_tok["left"] < width * 0.2:
                raw_q_indices.append(idx)
                raw_q_numbers.append(n)
    # Deduplicate by question number, keeping the first occurrence of
    # each number in top-to-bottom order. This avoids treating repeated
    # references to the same question as separate questions.
    seen = set()
    q_indices = []
    q_numbers = []
    for idx, n in zip(raw_q_indices, raw_q_numbers):
        if n in seen:
            continue
        seen.add(n)
        q_indices.append(idx)
        q_numbers.append(n)
    # If we found three or more distinct question numbers, trust them.
    if len(q_indices) >= 3:
        questions = []
        for i, idx in enumerate(q_indices):
            start = idx
            end = q_indices[i + 1] if i + 1 < len(q_indices) else len(lines)
            # Concatenate text from this line to the one before the next
            # question number.
            chunk_lines = [lines[j]["text"] for j in range(start, end)]
            q_text = " ".join(chunk_lines).strip()
            # Strip leading number / bullets at the very start only.
            q_text = re.sub(r"^\s*\d+[).]?\s*", "", q_text)
            # Try to find marks inside text; otherwise default.
            m = re.search(r"(\d+)\s*marks?", q_text, re.IGNORECASE)
            marks = m.group(1) if m else "10"
            questions.append({
                "number": str(q_numbers[i]),
                "question": q_text,
                "marks": marks,
            })
        return questions
    # Otherwise (0-2 detected numbers), fall back to paragraph segmentation
    # based on vertical gaps between lines. Keep only lines with letters.
    body_lines = [
        line for line in lines
        if line["top"] >= header_cutoff and any(c.isalpha() for c in line["text"])
    ]
    if not body_lines:
        return []
    # Compute vertical spacings between consecutive body lines.
    spacings = [
        body_lines[i + 1]["top"] - body_lines[i]["top"]
        for i in range(len(body_lines) - 1)
    ]
    if not spacings:
        segments = [body_lines]
    else:
        # A "paragraph break" is a gap well above the median line spacing.
        spacings_sorted = sorted(spacings)
        median_space = spacings_sorted[len(spacings_sorted) // 2]
        gap_threshold = max(int(median_space * 2.5), median_space + 10)
        segments = []
        current = [body_lines[0]]
        for i in range(len(body_lines) - 1):
            gap = body_lines[i + 1]["top"] - body_lines[i]["top"]
            if gap > gap_threshold:
                segments.append(current)
                current = [body_lines[i + 1]]
            else:
                current.append(body_lines[i + 1])
        segments.append(current)
    questions = []
    next_number = 1
    for seg in segments:
        # Split each segment further based on lines that look like
        # question starts (imperative verbs etc.). This helps when a
        # single paragraph actually contains multiple subquestions.
        sub_starts = []
        for idx, line in enumerate(seg):
            if idx == 0 or _line_looks_like_question_start(line["text"]):
                sub_starts.append(idx)
        if not sub_starts:
            sub_starts = [0]
        for si, start_idx in enumerate(sub_starts):
            end_idx = sub_starts[si + 1] if si + 1 < len(sub_starts) else len(seg)
            sub_lines = seg[start_idx:end_idx]
            q_text = " ".join(line["text"] for line in sub_lines).strip()
            q_text = re.sub(r"^\s*\d+[).]?\s*", "", q_text)
            m = re.search(r"(\d+)\s*marks?", q_text, re.IGNORECASE)
            marks = m.group(1) if m else "10"
            questions.append({
                "number": str(next_number),
                "question": q_text,
                "marks": marks,
            })
            next_number += 1
    return questions
def extract_questions_from_text(text: str):
    """Generic, subject-agnostic question extractor over raw OCR text.

    Question boundaries are detected either from explicit leading numbers
    ("1.", "2)") or from imperative-verb / scenario-style openers
    ("Design ...", "You are ..."). Lines between boundaries are folded
    into the current question's body. Intended to work across papers of
    the same exam model.
    """
    cleaned = [re.sub(r"\s+", " ", raw).strip() for raw in text.split("\n")]
    results = []
    buffer = []
    buffer_number = None
    # "Answer all the questions" marks where the body starts on full pages.
    anchor_re = re.compile(r"answer\s+all\s+the\s+questions", re.IGNORECASE)
    anchored = any(anchor_re.search(l) for l in cleaned)

    def commit():
        # Finalize the buffered question (if any) into `results`.
        nonlocal buffer, buffer_number
        if not buffer:
            return
        body = " ".join(buffer).strip()
        if body:
            # Strip numbering/bullets only at the very start of the string;
            # digits elsewhere (e.g. the "10" in "10 marks") must survive.
            body = re.sub(r"^\s*\d+[).]?\s*", "", body)
            marks_m = re.search(r"(\d+)\s*marks?", body, re.IGNORECASE)
            results.append({
                'number': buffer_number,  # may be None; backfilled below
                'question': body,
                'marks': marks_m.group(1) if marks_m else "10",
            })
        buffer = []
        buffer_number = None

    # Without the anchor (e.g. a cropped mid-table image like qp003) the
    # entire text is treated as body.
    inside = not anchored
    for entry in cleaned:
        if not entry:
            continue
        if not inside:
            # Still in the header: wait for the explicit anchor line.
            if anchor_re.search(entry):
                inside = True
            continue
        # Skip the table header row.
        if re.search(r"question\s+description", entry, re.IGNORECASE):
            continue
        # Subparts like "a)" / "b)" attach to the current question. Allow
        # leading table pipes/bullets, and OCR mangling like "¢." for "c.".
        if re.match(r"^[^A-Za-z0-9]*[a-dA-D¢][).]\s+", entry):
            if buffer:
                buffer.append(entry)
            continue
        starts_new = False
        number = None
        remainder = None
        num_m = re.match(r"^(\d+)[).]?\s+(.+)$", entry)
        if num_m:
            number, remainder = num_m.group(1), num_m.group(2)
            tail = remainder.lstrip()
            # Accept the number as a boundary only when what follows looks
            # like a genuine question start (or begins uppercase); this
            # avoids lines like "3 latency ,trigger ..." where the digit is
            # just a formatting artefact.
            if tail and (tail[0].isupper() or _line_looks_like_question_start(tail)):
                starts_new = True
        # With no usable number, fall back to verb-based start detection.
        if not starts_new and _line_looks_like_question_start(entry):
            starts_new = True
        if starts_new:
            commit()
            buffer_number = number
            buffer = [remainder if (number and remainder) else entry]
        elif buffer:
            # Continuation of the current question (ignore stray pre-text).
            buffer.append(entry)
    commit()
    # Backfill missing numbers sequentially.
    for pos, item in enumerate(results, 1):
        if not item['number']:
            item['number'] = str(pos)
    return results
def extract_questions_with_marks(text):
    """Legacy text-based extractor (kept as fallback if needed).

    Scans the OCR text for numbered question lines ("1.", "1)", possibly
    preceded by bullets/quotes), accumulates up to the next 10 lines as
    the question body, and captures a "(N marks)" / "N marks" value when
    present. Returns a list of dicts with 'number', 'question' and
    'marks' keys.
    """
    questions = []
    # Split text into lines
    lines = text.split('\n')
    # We will detect question numbers in a robust way (e.g. "1.", "1)",
    # possibly preceded by bullet characters or quotes).
    current_question = None
    current_number = None
    marks_found = False
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # Skip empty lines
        if not line:
            i += 1
            continue
        # Check if this is a question number.
        # Allow for leading non-digit chars (quotes, bullets) and either
        # a dot or closing parenthesis after the number, e.g. "1.", "1)", "• 1.".
        match = re.match(r'^\D*(\d+)[).]?\s*(.*)$', line)
        if match:
            # Heuristic: ignore matches where there is no alphabetic
            # character in the remainder; this filters out things like
            # isolated years or roll numbers.
            remainder = match.group(2)
            if not re.search(r'[A-Za-z]', remainder):
                i += 1
                continue
            # Save previous question if exists
            if current_question and current_number:
                questions.append({
                    'number': current_number,
                    'question': current_question.strip(),
                    'marks': '10' if not marks_found else 'marks found in text'
                })
            current_number = match.group(1)
            current_question = match.group(2) if match.group(2) else ""
            marks_found = False
            # Look ahead for question content and marks
            j = i + 1
            while j < len(lines) and j < i + 10:  # Look at next 10 lines max
                next_line = lines[j].strip()
                # Stop if we hit another question number
                if re.match(r'^(\d+)\.\s*', next_line):
                    break
                # Add to current question
                if next_line:
                    current_question += " " + next_line
                # Check for marks: prefer the parenthesised form, then a
                # bare "N marks".
                marks_match = re.search(r'\((\d+)\s*marks?\)', next_line, re.IGNORECASE)
                if not marks_match:
                    marks_match = re.search(r'(\d+)\s*marks?', next_line, re.IGNORECASE)
                if marks_match:
                    marks_found = True
                    # Extract the marks and clean the question text
                    marks = marks_match.group(1)
                    questions.append({
                        'number': current_number,
                        'question': re.sub(r'\s*\(\d+\s*marks?\)\s*', '', current_question).strip(),
                        'marks': marks
                    })
                    current_question = None
                    current_number = None
                    break
                j += 1
            # NOTE(review): when marks were found we jump past the consumed
            # lines (i = j); otherwise we advance by one, so already-appended
            # look-ahead lines may be revisited — preserved legacy behaviour.
            i = j if current_question is None else i + 1
        else:
            i += 1
    # Add the last question if exists
    if current_question and current_number:
        questions.append({
            'number': current_number,
            'question': current_question.strip(),
            'marks': '10'  # Default marks if not found
        })
    # If no questions found, try to extract from table format
    if not questions:
        # Look for pipe-separated table rows: "<q_num> | ... | ... | <marks> |"
        table_pattern = r'(\d+)\s*\|.*?\|.*?\|\s*(\d+)\s*\|'
        for i, line in enumerate(lines):
            match = re.search(table_pattern, line)
            if match:
                q_num = match.group(1)
                marks = match.group(2)
                # Find the question text (might be in surrounding lines).
                # NOTE(review): these keywords are specific to the sample
                # papers this fallback was written against.
                question_text = ""
                for j in range(max(0, i-5), min(len(lines), i+5)):
                    if 'city council' in lines[j].lower() or 'smart' in lines[j].lower() or 'agriculture' in lines[j].lower():
                        question_text = lines[j].strip()
                        break
                if question_text:
                    questions.append({
                        'number': q_num,
                        'question': question_text,
                        'marks': marks
                    })
    return questions
def process_question_paper(image_path, output_path):
    """Process a question paper image and save the extracted content.

    Fully subject-agnostic: runs OCR, infers a subject line from generic
    headers, extracts questions using generic heuristics, and writes a
    structured text file (subject, total questions, and numbered questions
    with marks).

    Returns a (subject, questions) tuple, where questions is a list of
    dicts with 'number', 'question' and 'marks' keys.
    """
    print(f"Processing: {image_path}")
    # Extract text and subject
    text = extract_text_from_image(image_path)
    subject = extract_subject_name(text)
    # 1) Try layout-based extraction first (uses Tesseract's positional
    #    data to find question numbers in the left column). This is
    #    particularly robust for table-style papers like VIT's CAT format.
    questions = extract_questions_with_layout(image_path)
    # 2) If that fails or finds too few questions, fall back to the
    #    generic text-line based extractor which uses only OCR'd text.
    if not questions or len(questions) < 3:
        questions = extract_questions_from_text(text)
    # Write out the results in a structured layout.
    # BUG FIX: the f-strings previously contained escaped '\\n', writing a
    # literal backslash+n into the file instead of real newlines; the PDF
    # writer (process_pdf_question_paper) already used real newlines.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Subject: {subject}\n\n")
        f.write(f"Total Questions: {len(questions)}\n\n")
        f.write("QUESTIONS\n\n")
        for q in questions:
            f.write(f"Q{q['number']} ({q['marks']} marks):\n")
            f.write(f"{q['question']}\n\n")
    print(f"Extracted content saved to: {output_path}")
    return subject, questions
def process_pdf_question_paper(pdf_path, output_path):
    """Process a PDF question paper by converting each page to an image.

    Each page is run through the same OCR + text-based question extractor,
    and all questions are combined into a single output text file.

    This function is defensive: if PDF support or poppler is missing, it
    writes a small diagnostic file instead of raising, so hf_predict can
    always read *something* from ``output_path``.

    Returns a (subject, questions) tuple; on failure the subject is
    "Unknown Subject" and the question list is empty.
    """
    # pdf2image is optional (see the guarded import at module top).
    if convert_from_path is None:
        msg_lines = [
            "ERROR: PDF support requires the 'pdf2image' package.",
            "Install it in the environment, e.g.: pip install pdf2image",
        ]
        # Still produce a well-formed output file so downstream readers work.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("Subject: Unknown Subject\n\n")
            f.write("Total Questions: 0\n\n")
            f.write("QUESTIONS\n\n")
            f.write("\n".join(msg_lines))
        print("\n".join(msg_lines))
        return "Unknown Subject", []
    print(f"Processing PDF: {pdf_path}")
    all_questions = []
    subject = None
    # Create temporary images for each page and clean them up afterwards.
    # The temp dir lives next to the PDF so page files share its filesystem.
    pdf_dir = os.path.dirname(os.path.abspath(pdf_path)) or os.getcwd()
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    with tempfile.TemporaryDirectory(prefix="qp_pdf_", dir=pdf_dir) as tmp_dir:
        try:
            # Use a moderate DPI to keep page images manageable while
            # still giving good OCR quality.
            pages = convert_from_path(pdf_path, dpi=200)
        except Exception as e:
            # Conversion can fail when poppler is absent or the PDF is bad;
            # emit a diagnostic output file rather than raising.
            err = f"ERROR: Failed to convert PDF to images: {e}"
            print(err)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write("Subject: Unknown Subject\n\n")
                f.write("Total Questions: 0\n\n")
                f.write("QUESTIONS\n\n")
                f.write(err)
            return "Unknown Subject", []
        image_paths = []
        for idx, page in enumerate(pages, start=1):
            img_path = os.path.join(tmp_dir, f"{base_name}_page_{idx}.png")
            page.save(img_path, "PNG")
            image_paths.append(img_path)
        for idx, img_path in enumerate(image_paths, start=1):
            # Reuse the same core logic as process_question_paper, but avoid
            # writing per-page outputs; we aggregate instead.
            text = extract_text_from_image(img_path)
            page_subject = extract_subject_name(text)
            # Keep the first real subject found across pages.
            if subject is None or subject == "Unknown Subject":
                subject = page_subject
            page_questions = extract_questions_from_text(text)
            all_questions.extend(page_questions)
    if subject is None:
        subject = "Unknown Subject"
    # Write combined results for the whole PDF.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Subject: {subject}\n\n")
        f.write(f"Total Questions: {len(all_questions)}\n\n")
        f.write("QUESTIONS\n\n")
        for q in all_questions:
            f.write(f"Q{q['number']} ({q['marks']} marks):\n")
            f.write(f"{q['question']}\n\n")
    print(f"Extracted content saved to: {output_path}")
    return subject, all_questions
def hf_predict(file):
    """Hugging Face Spaces-compatible prediction function.

    Wraps the extraction pipeline so it can be used as a model endpoint:
    accepts an uploaded image/PDF and returns a single text blob with the
    subject and all extracted questions.

    Parameters
    ----------
    file : str or file-like
        Path to an image/PDF or a file object (as provided by Gradio).

    Returns
    -------
    str
        The contents of the generated *_questions.txt file (subject and
        numbered questions with marks).
    """
    # Resolve the filesystem path from the incoming object.
    input_path = file if isinstance(file, str) else getattr(file, "name", None)
    if input_path is None:
        raise ValueError("Unsupported file input type for hf_predict")
    ext = os.path.splitext(input_path)[1].lower()
    with tempfile.TemporaryDirectory(prefix="hf_qp_") as tmp_dir:
        stem = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join(tmp_dir, f"{stem}_questions.txt")
        # Dispatch on extension: PDFs get the page-by-page pipeline.
        if ext == ".pdf":
            subject, questions = process_pdf_question_paper(input_path, output_path)
        else:
            subject, questions = process_question_paper(input_path, output_path)
        # Normally process_* has written output_path; read it back.
        if os.path.exists(output_path):
            with open(output_path, "r", encoding="utf-8") as fh:
                return fh.read()
        # Unexpected failure path: build the same report in memory rather
        # than raising FileNotFoundError.
        pieces = [
            f"Subject: {subject}",
            "",
            f"Total Questions: {len(questions)}",
            "",
            "QUESTIONS",
            "",
        ]
        for q in questions:
            pieces.append(f"Q{q['number']} ({q['marks']} marks):")
            pieces.append(q['question'])
            pieces.append("")
        return "\n".join(pieces)
def main():
    """Entry point.

    Usage:
        python question_extractor.py image1.jpg image2.png

    Without arguments, every image/PDF found next to this script is
    processed (the original behaviour).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    allowed_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.pdf')
    # Prefer explicit command-line paths; otherwise scan this folder.
    if len(sys.argv) > 1:
        inputs = sys.argv[1:]
    else:
        inputs = [
            os.path.join(base_dir, name)
            for name in os.listdir(base_dir)
            if name.lower().endswith(allowed_exts)
        ]
    if not inputs:
        print("No image files provided and none found in the questionPaperExtractor folder")
        return
    # Process each input path (image or PDF).
    for i, input_path in enumerate(inputs, 1):
        abs_path = os.path.abspath(input_path)
        out_dir = os.path.dirname(abs_path) or base_dir
        stem = os.path.splitext(os.path.basename(abs_path))[0]
        ext = os.path.splitext(abs_path)[1].lower()
        out_file = os.path.join(out_dir, f"{stem}_questions.txt")
        processor = process_pdf_question_paper if ext == '.pdf' else process_question_paper
        subject, questions = processor(abs_path, out_file)
        print(f"\n{'='*50}")
        print(f"Input {i}: {os.path.basename(abs_path)}")
        print(f"Subject: {subject}")
        print(f"Number of questions extracted: {len(questions)}")
        print(f"Output saved to: {out_file}")
        print('='*50)


if __name__ == "__main__":
    main()