class QuestionPaper:
    """Hold the questions and answers extracted from one question paper.

    Attributes:
        questions: list of question strings, in the order they were added.
        answers: list of answer strings, in the order they were added.
        path: location of the source file, or ``None`` when not set.
    """

    # Header/footer phrases that must never be kept as an answer. Matching is
    # case-insensitive and succeeds when the phrase occurs anywhere in the
    # answer text.
    _UNWANTED_PHRASES = (
        "Time: 15 MinutesMarks: 20",
        "Time: 15 Minutes Marks: 20",
        "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
        "GENERAL KNOWLEDGE QUESTION PAPER",
    )

    def __init__(self, path=None):
        self.questions = []
        self.answers = []
        self.path = path

    def clean_answers(self):
        """Drop blank answers and answers containing a known header phrase.

        The surviving answers are stored stripped of surrounding whitespace,
        replacing ``self.answers`` in place.

        NOTE(review): entries are removed, not blanked, so after this call
        ``self.answers`` can be shorter than ``self.questions`` and the two
        lists may no longer correspond index-for-index. Confirm that callers
        do not pair them by position.
        """
        unwanted = re.compile(
            "|".join(re.escape(phrase) for phrase in self._UNWANTED_PHRASES),
            re.IGNORECASE,
        )
        self.answers = [
            answer.strip()
            for answer in self.answers
            if answer.strip() and not unwanted.search(answer)
        ]

    def add_question(self, question_text):
        """Append ``question_text`` to the list of questions."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append ``answer_text`` to the list of answers."""
        self.answers.append(answer_text)

    def to_dict(self):
        """Return the questions and answers as a dict of two lists."""
        return {
            'questions': self.questions,
            'answers': self.answers
        }


def parse_question_paper_text(text):
    """Split raw question-paper text into parallel question/answer lists.

    Lines are stripped, and blank lines and known header lines are discarded.
    A line that starts with a number followed by ``.`` or ``)`` is a question
    and is kept in full, including its number. If the line directly after a
    question is not itself a question, it becomes that question's answer;
    otherwise the answer is the empty string. Any other line is ignored.

    Args:
        text: the text to parse, with lines separated by ``\\n``.

    Returns:
        A ``(questions, answers)`` tuple of two lists of equal length, where
        ``answers[i]`` belongs to ``questions[i]``.
    """
    # Header lines to drop; matched case-insensitively from the line start.
    header_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'GENERAL KNOWLEDGE QUESTION PAPER.*',
            # ``.*`` may match nothing, so this also covers "MinutesMarks".
            r'Time:\s*\d+\s*Minutes.*Marks:\s*\d+',
        )
    ]
    # A question starts with a number followed by a dot or a parenthesis.
    question_pattern = re.compile(r'^\d+\s*[.)]\s*(.+)')

    stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
    content_lines = [
        line
        for line in stripped_lines
        if line and not any(header.match(line) for header in header_patterns)
    ]

    questions = []
    answers = []
    index = 0
    total = len(content_lines)
    while index < total:
        line = content_lines[index]
        if not question_pattern.match(line):
            # Neither a question nor the line right after one: skip it.
            index += 1
            continue

        questions.append(line)
        follower = content_lines[index + 1] if index + 1 < total else None
        if follower is not None and not question_pattern.match(follower):
            answers.append(follower)
            index += 2  # consume the question and its answer together
        else:
            # Last line, or the next line is another question: no answer.
            answers.append("")
            index += 1

    return questions, answers
def _ocr_saved_question_paper(saved_path, is_pdf):
    """Return the OCR text of the saved upload (all pages when it is a PDF)."""
    if is_pdf:
        # NOTE(review): hard-coded Windows poppler location; presumably this
        # fails on hosts where poppler lives elsewhere - consider making it
        # configurable.
        page_images = convert_from_path(
            saved_path,
            poppler_path=r'C:\Program Files\poppler\Library\bin',
        )
        # Each page's text is followed by a newline so pages do not run
        # together.
        return "".join(
            pytesseract.image_to_string(page_image) + "\n"
            for page_image in page_images
        )

    # Context manager so the image's file handle is released after OCR.
    with Image.open(saved_path) as image:
        return pytesseract.image_to_string(image)


@app.route('/process_question_paper', methods=['POST'])
def process_question_paper():
    """Save an uploaded question paper, OCR it and return its Q&A as JSON.

    Expects a multipart upload under the form field ``file``. A filename
    ending in ``.pdf`` (case-insensitive) is treated as a PDF; anything else
    is opened as an image. The upload is stored in the ``Images`` directory
    under the application root using a fixed filename, so each request
    overwrites the previous file of the same kind.

    Returns:
        * 200 with ``{'questions': [...], 'answers': [...]}`` on success.
        * 400 with an ``error`` message when no file was sent or selected.
        * 500 with the exception text as ``error`` when processing fails.

    Side effects:
        Rebinds the module-level ``last_processed_question_paper_object`` to
        the newly processed ``QuestionPaper`` on success.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    question_paper = QuestionPaper()

    try:
        # Create the Images directory if it does not exist yet.
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        is_pdf = file.filename.lower().endswith('.pdf')
        question_paper_filename = (
            "question_paper.pdf" if is_pdf else "question_paper.png"
        )
        question_paper_path = os.path.join(images_dir, question_paper_filename)
        file.save(question_paper_path)
        question_paper.path = question_paper_path

        text = _ocr_saved_question_paper(question_paper_path, is_pdf)

        questions, answers = parse_question_paper_text(text)
        question_paper.questions = questions
        question_paper.answers = answers

        # Remove any remaining unwanted patterns from the answers.
        # NOTE(review): if clean_answers() drops entries, the answers may no
        # longer line up index-for-index with the questions - confirm that
        # consumers of this response do not pair them by position.
        question_paper.clean_answers()

        # Keep the processed paper available at module level.
        last_processed_question_paper_object = question_paper

        return jsonify(question_paper.to_dict())

    except Exception as e:
        # Top-level request boundary: record the traceback server-side, then
        # report the failure to the client in the same shape as before.
        app.logger.exception("Failed to process question paper")
        return jsonify({'error': str(e)}), 500