Spaces:
Paused
Paused
class QuestionPaper:
    """Hold the questions and answers extracted from one question paper.

    ``questions`` and ``answers`` are plain lists of strings; ``path`` is
    whatever location the caller associates with the paper (``None`` by
    default).
    """

    # Header/footer text that must never be kept as an answer.
    _UNWANTED_ANSWER_TEXT = (
        "Time: 15 MinutesMarks: 20",
        "Time: 15 Minutes Marks: 20",
        "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
        "GENERAL KNOWLEDGE QUESTION PAPER",
    )
    # One case-insensitive alternation, compiled once, instead of escaping
    # and compiling every pattern again for every answer.
    _UNWANTED_ANSWER_RE = re.compile(
        "|".join(re.escape(text) for text in _UNWANTED_ANSWER_TEXT),
        re.IGNORECASE,
    )

    def __init__(self, path=None):
        self.questions = []
        self.answers = []
        self.path = path

    def clean_answers(self):
        """Strip every answer and discard header/footer text, in place.

        An answer is rejected when it contains any of the unwanted header
        strings (case-insensitive substring match).

        When there is exactly one answer per question, the two lists are
        treated as index-aligned: rejected answers are replaced by ``""``
        and empty placeholders are kept, so ``answers[i]`` still belongs to
        ``questions[i]``. Deleting entries there would shift every later
        answer onto the wrong question.

        When the lists are not paired, empty and rejected answers are
        removed from the list.
        """
        unwanted = self._UNWANTED_ANSWER_RE
        stripped = [answer.strip() for answer in self.answers]

        if len(stripped) == len(self.questions):
            # Paired with questions: blank out instead of deleting.
            self.answers = [
                "" if unwanted.search(answer) else answer
                for answer in stripped
            ]
        else:
            self.answers = [
                answer
                for answer in stripped
                if answer and not unwanted.search(answer)
            ]

    def add_question(self, question_text):
        """Append ``question_text`` to the list of questions."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append ``answer_text`` to the list of answers."""
        self.answers.append(answer_text)

    def to_dict(self):
        """Return the questions and answers as a dict of two lists."""
        return {
            'questions': self.questions,
            'answers': self.answers
        }
# Header/footer lines to drop before parsing (matched case-insensitively at
# the start of the stripped line). ``Minutes.*Marks`` also covers the fused
# "MinutesMarks" spelling because ``.*`` may match nothing.
_IGNORED_LINE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'GENERAL KNOWLEDGE QUESTION PAPER.*',
        r'Time:\s*\d+\s*Minutes.*Marks:\s*\d+',
    )
)

# A question starts with a number followed by "." or ")" and some text.
_QUESTION_LINE_RE = re.compile(r'^\d+\s*[.)]\s*(.+)')


def parse_question_paper_text(text):
    """Split OCR text into parallel lists of questions and answers.

    Blank lines and header lines are discarded. Every remaining line that
    starts with a number followed by ``.`` or ``)`` is a question and is
    kept in full, number included. The line directly after a question is
    taken as its answer unless that line is itself a question; a question
    without such a line gets ``""`` as a placeholder. Any other line is
    ignored.

    Args:
        text: Raw text of the question paper, lines separated by ``\\n``.

    Returns:
        A ``(questions, answers)`` tuple of two lists of equal length,
        where ``answers[i]`` belongs to ``questions[i]``.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    content = [
        line
        for line in stripped_lines
        if line and not any(rx.match(line) for rx in _IGNORED_LINE_RES)
    ]

    questions = []
    answers = []
    index = 0
    while index < len(content):
        line = content[index]
        if not _QUESTION_LINE_RE.match(line):
            # Neither a question nor the line right after one: skip it.
            index += 1
            continue

        questions.append(line)
        following = content[index + 1] if index + 1 < len(content) else None
        if following is not None and not _QUESTION_LINE_RE.match(following):
            answers.append(following)
            index += 2  # consume the question and its answer
        else:
            # Last line, or the next line is another question.
            answers.append("")
            index += 1

    return questions, answers
def process_question_paper():
    """OCR an uploaded question paper and return its questions and answers.

    Reads the upload from the ``file`` field of the current request. A file
    whose name ends in ``.pdf`` is converted to page images and each page is
    OCR'd; anything else is opened as a single image. The recognised text is
    parsed into questions and answers, the answers are cleaned, and the
    resulting ``QuestionPaper`` is stored in the module-level
    ``last_processed_question_paper_object``.

    The upload is saved under ``<app.root_path>/Images`` with a fixed name
    (``question_paper.pdf`` or ``question_paper.png``), so each request
    overwrites the previous file of the same kind.

    For PDFs, the poppler binaries are looked up in the directory named by
    the ``POPPLER_PATH`` environment variable, falling back to
    ``C:\\Program Files\\poppler\\Library\\bin``.

    Returns:
        A JSON response with ``questions`` and ``answers`` on success, or a
        ``(response, status)`` pair with an ``error`` message: 400 when no
        file was supplied or selected, 500 when processing fails.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    uploaded = request.files['file']
    # ``not`` also covers a missing (None) filename, which would otherwise
    # fail on ``.lower()`` below and surface as a 500.
    if not uploaded.filename:
        return jsonify({'error': 'No file selected'}), 400

    question_paper = QuestionPaper()
    try:
        # Create Images directory if it doesn't exist
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        is_pdf = uploaded.filename.lower().endswith('.pdf')
        question_paper_filename = (
            "question_paper.pdf" if is_pdf else "question_paper.png"
        )
        question_paper_path = os.path.join(images_dir, question_paper_filename)
        uploaded.save(question_paper_path)
        question_paper.path = question_paper_path

        if is_pdf:
            poppler_path = (
                os.environ.get('POPPLER_PATH')
                or r'C:\Program Files\poppler\Library\bin'
            )
            pages = convert_from_path(
                question_paper_path, poppler_path=poppler_path
            )
            # One OCR pass per page, each page's text terminated by a newline.
            text = "".join(
                pytesseract.image_to_string(page) + "\n" for page in pages
            )
        else:
            # Close the image file handle as soon as OCR is done.
            with Image.open(question_paper_path) as image:
                text = pytesseract.image_to_string(image)

        questions, answers = parse_question_paper_text(text)
        question_paper.questions = questions
        question_paper.answers = answers

        # Clean the answers (remove any remaining unwanted patterns)
        question_paper.clean_answers()

        # Store the processed question paper globally
        last_processed_question_paper_object = question_paper
        return jsonify(question_paper.to_dict())
    except Exception as e:
        # Top-level request boundary: record the traceback server-side
        # before reporting the failure to the client.
        app.logger.exception("Failed to process uploaded question paper")
        return jsonify({'error': str(e)}), 500