Spaces:

AswinMathew
/

ocr-omr-backend

Running

File size: 7,106 Bytes

b8548e4

class QuestionPaper:
    """Questions and answers extracted from one question paper.

    Attributes are plain public fields:
      * ``path``      - location of the source file, or ``None``.
      * ``questions`` - list of question strings, in insertion order.
      * ``answers``   - list of answer strings, in insertion order.
    """

    def __init__(self, path=None):
        self.path = path
        self.questions = []
        self.answers = []

    def add_question(self, question_text):
        """Append ``question_text`` to the list of questions."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append ``answer_text`` to the list of answers."""
        self.answers.append(answer_text)

    def clean_answers(self):
        """Drop blank answers and answers containing known header text.

        An answer is kept only if it is non-blank after stripping and none
        of the header fragments below occurs anywhere inside it
        (case-insensitive). Kept answers are stored stripped.

        NOTE: only ``self.answers`` is rewritten; ``self.questions`` is left
        untouched, so removing an answer shifts the position of every later
        answer relative to the questions list.
        """
        header_fragments = [
            "Time: 15 MinutesMarks: 20",
            "Time: 15 Minutes Marks: 20",
            "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
            "GENERAL KNOWLEDGE QUESTION PAPER",
            ""  # Empty strings
        ]

        # A single alternation of the escaped, non-empty fragments matches
        # exactly when at least one fragment occurs in the answer.
        header_regex = re.compile(
            "|".join(re.escape(fragment) for fragment in header_fragments if fragment),
            re.IGNORECASE,
        )

        self.answers = [
            candidate.strip()
            for candidate in self.answers
            if candidate.strip() and header_regex.search(candidate) is None
        ]

    def to_dict(self):
        """Return the questions and answers as a JSON-serialisable dict."""
        return {
            'questions': self.questions,
            'answers': self.answers
        }

def parse_question_paper_text(text):
    """Split OCR text of a question paper into parallel question/answer lists.

    Processing steps:
      1. Split ``text`` on newlines, strip each line and drop blank lines.
      2. Drop header lines: those starting (case-insensitively) with
         ``GENERAL KNOWLEDGE QUESTION PAPER`` or matching the
         ``Time: <n> Minutes ... Marks: <n>`` banner.
      3. Treat a line that starts with digits followed by ``.`` or ``)`` and
         some further text as a question. The line immediately after it is
         its answer unless that line is itself a question, in which case the
         answer is the empty string.
      4. Any other line that is not consumed as an answer is ignored.

    Args:
        text: Raw text, typically OCR output, with one item per line.

    Returns:
        A ``(questions, answers)`` tuple of two lists of equal length.
        ``questions[i]`` is the full question line (number included) and
        ``answers[i]`` is its answer, or ``""`` when none was found.
    """
    # Compiled once per call rather than re-resolved for every line.
    header_line = re.compile(
        r'GENERAL KNOWLEDGE QUESTION PAPER.*'
        r'|Time:\s*\d+\s*Minutes.*Marks:\s*\d+',
        re.IGNORECASE,
    )
    question_line = re.compile(r'^\d+\s*[.)]\s*(.+)')

    content_lines = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped and not header_line.match(stripped):
            content_lines.append(stripped)

    questions = []
    answers = []

    index = 0
    total = len(content_lines)
    while index < total:
        line = content_lines[index]

        if not question_line.match(line):
            # A line that is neither a question nor directly follows one.
            index += 1
            continue

        questions.append(line)  # Keep the full question, number included.

        has_follower = index + 1 < total
        if has_follower and not question_line.match(content_lines[index + 1]):
            # The line right after a question is taken as its answer.
            answers.append(content_lines[index + 1])
            index += 2
        else:
            # Followed by another question, or end of input: no answer found.
            answers.append("")
            index += 1

    # Every question appended exactly one answer, so the lists are aligned.
    return questions, answers

@app.route('/process_question_paper', methods=['POST'])
def process_question_paper():
    """OCR an uploaded question paper and return its questions and answers.

    Expects a multipart upload in the ``file`` field. A filename ending in
    ``.pdf`` is converted page by page to images and OCR'd; anything else is
    opened as a single image. The recognised text is parsed with
    ``parse_question_paper_text``, the answers are cleaned, and the resulting
    ``QuestionPaper`` is stored in the module-level
    ``last_processed_question_paper_object``.

    The Poppler binaries used for PDF conversion are located as follows:
    the ``POPPLER_PATH`` environment variable if set, otherwise the legacy
    Windows install directory if it exists, otherwise the system ``PATH``.

    Returns:
        * 200 with the JSON from ``QuestionPaper.to_dict()`` on success.
        * 400 with ``{'error': ...}`` when no file was sent or selected.
        * 500 with ``{'error': str(exception)}`` when processing fails.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    question_paper = QuestionPaper()

    try:
        # Create Images directory if it doesn't exist
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        # The upload is always stored under a fixed name, so each request
        # overwrites the previous file of the same kind.
        is_pdf = file.filename.lower().endswith('.pdf')
        question_paper_filename = "question_paper.pdf" if is_pdf else "question_paper.png"
        question_paper_path = os.path.join(images_dir, question_paper_filename)
        file.save(question_paper_path)
        question_paper.path = question_paper_path

        if is_pdf:
            # Only pass an explicit Poppler directory when it really exists;
            # with None, pdf2image looks the binaries up on PATH, which keeps
            # the endpoint working on non-Windows hosts.
            poppler_path = (
                os.environ.get('POPPLER_PATH')
                or r'C:\Program Files\poppler\Library\bin'
            )
            if not os.path.isdir(poppler_path):
                poppler_path = None

            images_from_pdf = convert_from_path(
                question_paper_path, poppler_path=poppler_path
            )
            # One OCR pass per page, each page's text terminated by a newline.
            text = "".join(
                pytesseract.image_to_string(page_image) + "\n"
                for page_image in images_from_pdf
            )
        else:
            # Context manager releases the underlying file handle after OCR.
            with Image.open(question_paper_path) as image:
                text = pytesseract.image_to_string(image)

        questions, answers = parse_question_paper_text(text)
        question_paper.questions = questions
        question_paper.answers = answers

        # Clean the answers (remove any remaining unwanted patterns)
        question_paper.clean_answers()

        # Store the processed question paper globally
        last_processed_question_paper_object = question_paper

        return jsonify(question_paper.to_dict())

    except Exception as e:
        # Request boundary: report any processing failure as a JSON 500.
        return jsonify({'error': str(e)}), 500