ocr-omr-backend / server.py
AswinMathew's picture
Deploy OCR/OMR backend to HF Spaces
b8548e4 verified
raw
history blame
7.11 kB
class QuestionPaper:
    """Questions and answers extracted from one question paper.

    Attributes:
        questions: Question strings, in the order they were added.
        answers: Answer strings. When this list has the same length as
            ``questions`` the two are treated as paired by index.
        path: Location of the source file on disk, or ``None``.
    """

    # Header/footer text that OCR tends to pick up as if it were an answer.
    # An answer containing any of these (case-insensitively) is rejected.
    UNWANTED_PATTERNS = (
        "Time: 15 MinutesMarks: 20",
        "Time: 15 Minutes Marks: 20",
        "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
        "GENERAL KNOWLEDGE QUESTION PAPER",
    )

    def __init__(self, path=None):
        self.questions = []
        self.answers = []
        self.path = path

    def clean_answers(self):
        """Strip answers and remove blank or header/footer entries in place.

        An answer is rejected when it is empty after stripping or contains
        one of ``UNWANTED_PATTERNS`` (case-insensitive substring match).

        If ``answers`` and ``questions`` have the same length they are
        paired by index, so a rejected answer is replaced with ``""`` to
        keep every remaining answer lined up with its question. Otherwise
        the answers stand alone and rejected entries are dropped.
        """
        # One alternation replaces the per-answer, per-pattern search loop.
        # Empty patterns are excluded: an empty alternative matches anything.
        unwanted = re.compile(
            "|".join(re.escape(p) for p in self.UNWANTED_PATTERNS if p),
            re.IGNORECASE,
        )
        paired = len(self.answers) == len(self.questions)

        cleaned_answers = []
        for answer in self.answers:
            stripped = answer.strip()
            if stripped and not unwanted.search(stripped):
                cleaned_answers.append(stripped)
            elif paired:
                # Keep the slot so later answers do not shift onto the
                # wrong question.
                cleaned_answers.append("")
        self.answers = cleaned_answers

    def add_question(self, question_text):
        """Append ``question_text`` to the list of questions."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append ``answer_text`` to the list of answers."""
        self.answers.append(answer_text)

    def to_dict(self):
        """Return ``{'questions': [...], 'answers': [...]}``.

        The returned dict references the instance's own lists (no copy).
        """
        return {
            'questions': self.questions,
            'answers': self.answers
        }
def parse_question_paper_text(text):
    """Split OCR text of a question paper into questions and answers.

    Lines are stripped and blank lines are discarded, as are lines that
    start with the paper's title or its "Time: .. Minutes .. Marks: .."
    banner (case-insensitive).

    A line that starts with a number followed by ``.`` or ``)`` and at
    least one more character is a question; it is kept whole, number
    included. If the line right after a question is not itself a question
    it becomes that question's answer, otherwise the answer is ``""``.
    Any other line (text before the first question, or extra lines after
    an answer) is ignored.

    Args:
        text: Raw OCR output. ``None`` or an empty string yields no
            questions.

    Returns:
        A ``(questions, answers)`` tuple of two lists of equal length,
        where ``answers[i]`` belongs to ``questions[i]``.
    """
    if not text:
        return [], []

    # Compiled once per call instead of once per line.
    question_re = re.compile(r'^\d+\s*[.)]\s*(.+)')
    ignored_line_res = [
        re.compile(r'GENERAL KNOWLEDGE QUESTION PAPER.*', re.IGNORECASE),
        # ".*" also matches nothing, so this covers "MinutesMarks:" too.
        re.compile(r'Time:\s*\d+\s*Minutes.*Marks:\s*\d+', re.IGNORECASE),
    ]

    lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line and not any(p.match(line) for p in ignored_line_res):
            lines.append(line)

    questions = []
    answers = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not question_re.match(line):
            # Stray line that belongs to no question: skip it.
            i += 1
            continue

        questions.append(line)
        has_next = i + 1 < len(lines)
        if has_next and not question_re.match(lines[i + 1]):
            answers.append(lines[i + 1])
            i += 2  # Consume the question and its answer.
        else:
            # Followed by another question, or nothing: keep the slot
            # so the two lists stay aligned by index.
            answers.append("")
            i += 1

    return questions, answers
# Poppler location used by the original Windows development setup.
_LEGACY_WINDOWS_POPPLER_PATH = r'C:\Program Files\poppler\Library\bin'


def _resolve_poppler_path():
    """Return the poppler ``bin`` directory to use, or ``None`` for PATH.

    Order: the ``POPPLER_PATH`` environment variable, then the legacy
    Windows install directory if it exists, then ``None`` so the poppler
    binaries are looked up on the system PATH.
    """
    configured = os.environ.get('POPPLER_PATH')
    if configured:
        return configured
    if os.path.isdir(_LEGACY_WINDOWS_POPPLER_PATH):
        return _LEGACY_WINDOWS_POPPLER_PATH
    return None


def _ocr_question_paper(path, is_pdf):
    """Return the OCR text of the saved upload at ``path``.

    A PDF is rendered page by page and the page texts are joined with
    newlines; anything else is opened as a single image.
    """
    if is_pdf:
        pages = convert_from_path(path, poppler_path=_resolve_poppler_path())
        return "\n".join(pytesseract.image_to_string(page) for page in pages)
    # Context manager closes the underlying file handle after OCR.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)


@app.route('/process_question_paper', methods=['POST'])
def process_question_paper():
    """Handle an uploaded question paper and return its parsed content.

    Expects a multipart upload in the ``file`` field. A filename ending
    in ``.pdf`` (case-insensitive) is processed as a PDF; any other file
    is processed as an image. The upload is saved under
    ``<app.root_path>/Images`` with a fixed name, overwriting the
    previous upload of the same kind.

    Returns:
        200 with ``{'questions': [...], 'answers': [...]}`` on success,
        400 with ``{'error': ...}`` when no file or filename was sent,
        500 with ``{'error': ...}`` when saving, OCR or parsing fails.

    Side effects:
        Stores the resulting ``QuestionPaper`` in the module-level
        ``last_processed_question_paper_object``.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    upload = request.files['file']
    # "not" also covers a missing (None) filename, which would otherwise
    # fail on .lower() below and surface as a 500.
    if not upload.filename:
        return jsonify({'error': 'No file selected'}), 400

    try:
        # Create Images directory if it doesn't exist
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        is_pdf = upload.filename.lower().endswith('.pdf')
        saved_name = "question_paper.pdf" if is_pdf else "question_paper.png"
        saved_path = os.path.join(images_dir, saved_name)
        upload.save(saved_path)

        question_paper = QuestionPaper(path=saved_path)
        text = _ocr_question_paper(saved_path, is_pdf)
        questions, answers = parse_question_paper_text(text)
        question_paper.questions = questions
        question_paper.answers = answers

        # Remove any header/footer text that slipped through as an answer.
        question_paper.clean_answers()

        # Store the processed question paper globally
        last_processed_question_paper_object = question_paper
        return jsonify(question_paper.to_dict())
    except Exception as e:
        # Request boundary: record the traceback server-side, then report
        # the failure to the client as before.
        app.logger.exception("Failed to process uploaded question paper")
        return jsonify({'error': str(e)}), 500