File size: 7,106 Bytes
b8548e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class QuestionPaper:
    """Hold the questions and answers extracted from one question paper.

    Attributes are plain lists that callers may read or assign directly:
    ``questions`` and ``answers`` hold text entries, and ``path`` is whatever
    the caller passed as the location of the source document (or ``None``).
    """

    def __init__(self, path=None):
        self.questions = []
        self.answers = []
        self.path = path

    def clean_answers(self):
        """Drop blank answers and answers that contain known header text.

        An answer is discarded when it is empty after stripping, or when any
        of the header phrases below occurs anywhere inside it (compared
        case-insensitively). Surviving answers are stored stripped.

        Note that entries are removed, not blanked, so ``answers`` may end up
        shorter than ``questions`` and no longer line up with it by index.
        """
        header_phrases = (
            "Time: 15 MinutesMarks: 20",
            "Time: 15 Minutes Marks: 20",
            "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
            "GENERAL KNOWLEDGE QUESTION PAPER",
        )
        # Literal, case-insensitive matchers. A substring hit also covers the
        # case where the whole answer equals one of the phrases.
        header_matchers = [
            re.compile(re.escape(phrase), re.IGNORECASE)
            for phrase in header_phrases
        ]

        kept = []
        for raw_answer in self.answers:
            trimmed = raw_answer.strip()
            if not trimmed:
                continue
            if any(matcher.search(raw_answer) for matcher in header_matchers):
                continue
            kept.append(trimmed)

        self.answers = kept

    def add_question(self, question_text):
        """Append ``question_text`` to the list of questions."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append ``answer_text`` to the list of answers."""
        self.answers.append(answer_text)

    def to_dict(self):
        """Return the questions and answers as a ``dict`` of the two lists."""
        return dict(questions=self.questions, answers=self.answers)

def parse_question_paper_text(text):
    """Split OCR text of a question paper into parallel question/answer lists.

    Lines are stripped and blank lines dropped. Lines that start with a known
    header ("GENERAL KNOWLEDGE QUESTION PAPER...", "Time: N Minutes ...
    Marks: N", case-insensitive) are ignored. A line is a question when it
    starts with a number followed by ``.`` or ``)`` and some text. The line
    immediately after a question is taken as its answer unless that line is
    itself a question; any other non-question line is skipped.

    Args:
        text: Raw text of the paper. ``None`` or an empty string yields two
            empty lists.

    Returns:
        A ``(questions, answers)`` tuple of equal-length lists. Each question
        keeps its leading number; a question without an answer line gets
        ``""`` at the same index.
    """
    if not text:
        return [], []

    # Headers/footers to drop. Matching is anchored at the start of the line.
    # The "Minutes.*Marks" form also covers the OCR-fused "MinutesMarks".
    header_patterns = [
        re.compile(r'GENERAL KNOWLEDGE QUESTION PAPER.*', re.IGNORECASE),
        re.compile(r'Time:\s*\d+\s*Minutes.*Marks:\s*\d+', re.IGNORECASE),
    ]
    # A question starts with a number followed by a dot or closing parenthesis.
    question_pattern = re.compile(r'^\d+\s*[.)]\s*(.+)')

    content_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line and not any(p.match(line) for p in header_patterns):
            content_lines.append(line)

    questions = []
    answers = []
    i = 0
    while i < len(content_lines):
        line = content_lines[i]
        if not question_pattern.match(line):
            # Not a question and not directly after one: nothing to pair with.
            i += 1
            continue

        questions.append(line)  # Keep the full question with its number
        has_next = i + 1 < len(content_lines)
        if has_next and not question_pattern.match(content_lines[i + 1]):
            answers.append(content_lines[i + 1])
            i += 2  # Consume both the question and its answer
        else:
            # Next line is another question (or there is none): no answer.
            answers.append("")
            i += 1

    # Exactly one answer is appended per question, so the lists stay aligned.
    return questions, answers

@app.route('/process_question_paper', methods=['POST'])
def process_question_paper():
    """Handle an uploaded question paper and return its questions/answers.

    Expects a multipart upload under the ``file`` key. The upload is saved
    into the ``Images`` directory under ``app.root_path`` (as
    ``question_paper.pdf`` when the filename ends in ``.pdf``, otherwise as
    ``question_paper.png``), its text is extracted with pytesseract, parsed
    with ``parse_question_paper_text`` and cleaned with
    ``QuestionPaper.clean_answers``. The resulting object is also stored in
    the module-level ``last_processed_question_paper_object``.

    The Poppler binaries used for PDF conversion are looked up in the
    ``POPPLER_PATH`` environment variable, falling back to the Windows
    install location this code was written against.

    Returns:
        JSON of ``QuestionPaper.to_dict()`` on success; a JSON ``error``
        object with status 400 when no file is supplied, or 500 when
        processing raises.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    # Covers both an empty filename and a missing (None) one.
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    is_pdf = file.filename.lower().endswith('.pdf')

    try:
        # Create Images directory if it doesn't exist
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        question_paper_filename = "question_paper.pdf" if is_pdf else "question_paper.png"
        question_paper_path = os.path.join(images_dir, question_paper_filename)
        file.save(question_paper_path)

        if is_pdf:
            poppler_path = os.environ.get(
                'POPPLER_PATH', r'C:\Program Files\poppler\Library\bin'
            )
            page_images = convert_from_path(question_paper_path, poppler_path=poppler_path)
            text = "\n".join(
                pytesseract.image_to_string(page_image) for page_image in page_images
            )
        else:
            # Close the image file handle as soon as OCR is done.
            with Image.open(question_paper_path) as image:
                text = pytesseract.image_to_string(image)

        questions, answers = parse_question_paper_text(text)

        question_paper = QuestionPaper(question_paper_path)
        question_paper.questions = questions
        question_paper.answers = answers

        # Clean the answers (remove any remaining unwanted patterns)
        question_paper.clean_answers()

        # Store the processed question paper globally
        last_processed_question_paper_object = question_paper

        return jsonify(question_paper.to_dict())

    except Exception as e:
        # Request boundary: record the traceback, then report the failure.
        app.logger.exception("Failed to process uploaded question paper")
        return jsonify({'error': str(e)}), 500