Spaces:

AswinMathew
/

ocr-omr-backend

Paused

App Files Files Community

ocr-omr-backend / server.py

AswinMathew

Deploy OCR/OMR backend to HF Spaces

b8548e4 verified 8 days ago

raw

history blame

7.11 kB

	class QuestionPaper:
	    """Questions and answers extracted from one question paper.

	    ``questions`` and ``answers`` are plain public lists; ``path`` is the
	    location of the source file, or ``None`` when it is not known.
	    """

	    # Boilerplate header text that must never be reported as an answer.
	    # Matching is case-insensitive and substring-based (see clean_answers).
	    _UNWANTED_PATTERNS = (
	        "Time: 15 MinutesMarks: 20",
	        "Time: 15 Minutes Marks: 20",
	        "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
	        "GENERAL KNOWLEDGE QUESTION PAPER",
	    )

	    def __init__(self, path=None):
	        """Create an empty paper, optionally remembering its source *path*."""
	        self.questions = []
	        self.answers = []
	        self.path = path

	    def clean_answers(self):
	        """Strip whitespace and remove boilerplate text from ``self.answers``.

	        An answer is unwanted when it is blank, is not a string, or contains
	        one of ``_UNWANTED_PATTERNS`` (case-insensitive).

	        When ``answers`` has the same length as a non-empty ``questions``
	        list, the two are treated as positionally paired: unwanted entries
	        are replaced by ``""`` instead of being deleted, so ``answers[i]``
	        still belongs to ``questions[i]``.  Otherwise unwanted entries are
	        dropped from the list.
	        """
	        # One alternation regex, built once per call instead of once per
	        # answer/pattern combination.
	        unwanted_re = re.compile(
	            "|".join(re.escape(pattern) for pattern in self._UNWANTED_PATTERNS),
	            re.IGNORECASE,
	        )
	        paired = bool(self.questions) and len(self.questions) == len(self.answers)

	        cleaned_answers = []
	        for answer in self.answers:
	            text = answer.strip() if isinstance(answer, str) else ""
	            if text and not unwanted_re.search(text):
	                cleaned_answers.append(text)
	            elif paired:
	                # Keep the slot so later answers do not shift onto the
	                # wrong question.
	                cleaned_answers.append("")

	        self.answers = cleaned_answers

	    def add_question(self, question_text):
	        """Append *question_text* to the list of questions."""
	        self.questions.append(question_text)

	    def add_answer(self, answer_text):
	        """Append *answer_text* to the list of answers."""
	        self.answers.append(answer_text)

	    def to_dict(self):
	        """Return the questions and answers as a dictionary."""
	        return {
	            'questions': self.questions,
	            'answers': self.answers
	        }

	def parse_question_paper_text(text):
	    """Split OCR text into parallel lists of questions and answers.

	    A question is a line that starts with a number followed by ``.`` or
	    ``)`` and some text, e.g. ``"1. ..."``, ``"2) ..."`` or ``"3 . ..."``.
	    The line directly after a question is taken as its answer unless it is
	    itself a question, in which case the answer is ``""``.  Header lines
	    (paper title, "Time ... Marks ...") and any other line that is neither
	    a question nor the line right after one are skipped.

	    Args:
	        text: The raw text of the question paper.

	    Returns:
	        A ``(questions, answers)`` tuple of equal-length lists.  Questions
	        keep their full text including the number.
	    """
	    # Header/footer lines to discard (matched from the start of the line,
	    # case-insensitively).  Blank lines are already removed below.
	    ignore_patterns = [
	        re.compile(r'GENERAL KNOWLEDGE QUESTION PAPER.*', re.IGNORECASE),
	        re.compile(r'Time:\s*\d+\s*Minutes.*Marks:\s*\d+', re.IGNORECASE),
	    ]
	    # Whitespace around the delimiter is optional so "1. Text" matches.
	    # The lookahead rejects a digit right after the delimiter so that
	    # decimal answers such as "9.8 m/s" are not mistaken for questions.
	    question_pattern = re.compile(r'^\d+\s*[.)](?!\d)\s*.+')

	    stripped_lines = (line.strip() for line in text.split('\n'))
	    filtered_lines = [
	        line
	        for line in stripped_lines
	        if line and not any(pattern.match(line) for pattern in ignore_patterns)
	    ]

	    questions = []
	    answers = []

	    i = 0
	    while i < len(filtered_lines):
	        current_line = filtered_lines[i]

	        if not question_pattern.match(current_line):
	            # Not a question and not consumed as an answer: skip it.
	            i += 1
	            continue

	        questions.append(current_line)

	        has_next = i + 1 < len(filtered_lines)
	        if has_next and not question_pattern.match(filtered_lines[i + 1]):
	            answers.append(filtered_lines[i + 1])
	            i += 2  # Consume both the question and its answer
	        else:
	            # Followed by another question, or last line: no answer found.
	            answers.append("")
	            i += 1

	    # Every question appended exactly one answer above, so the two lists
	    # are always the same length.
	    return questions, answers

	@app.route('/process_question_paper', methods=['POST'])
	def process_question_paper():
	    """OCR an uploaded question paper and return its questions and answers.

	    Expects a multipart upload under the form field ``file``.  A filename
	    ending in ``.pdf`` is converted page by page to images before OCR; any
	    other upload is opened as a single image.  The extracted text is parsed
	    with ``parse_question_paper_text`` and the answers are cleaned.

	    Returns:
	        A JSON object with ``questions`` and ``answers`` on success, or
	        ``{'error': ...}`` with status 400 (missing file) or 500 (any
	        failure while saving, converting, or parsing).

	    Side effects:
	        Saves the upload under ``<app.root_path>/Images`` using a fixed
	        filename and stores the result in the module-level
	        ``last_processed_question_paper_object``.
	    """
	    global last_processed_question_paper_object

	    if 'file' not in request.files:
	        return jsonify({'error': 'No file provided'}), 400

	    uploaded = request.files['file']
	    # "not" also covers a filename of None, which would otherwise fail on
	    # .lower() below and be reported as a server error.
	    if not uploaded.filename:
	        return jsonify({'error': 'No file selected'}), 400

	    question_paper = QuestionPaper()

	    try:
	        # Create Images directory if it doesn't exist
	        images_dir = os.path.join(app.root_path, 'Images')
	        os.makedirs(images_dir, exist_ok=True)

	        is_pdf = uploaded.filename.lower().endswith('.pdf')
	        question_paper_filename = "question_paper.pdf" if is_pdf else "question_paper.png"
	        question_paper_path = os.path.join(images_dir, question_paper_filename)
	        uploaded.save(question_paper_path)
	        question_paper.path = question_paper_path

	        if is_pdf:
	            # Resolve poppler portably: explicit POPPLER_PATH override,
	            # then the historical Windows install location if it exists,
	            # otherwise None so the binaries are looked up on PATH.
	            poppler_path = os.environ.get('POPPLER_PATH')
	            if not poppler_path:
	                windows_default = r'C:\Program Files\poppler\Library\bin'
	                poppler_path = windows_default if os.path.isdir(windows_default) else None

	            images_from_pdf = convert_from_path(question_paper_path, poppler_path=poppler_path)
	            all_text = "".join(
	                pytesseract.image_to_string(page_image) + "\n"
	                for page_image in images_from_pdf
	            )
	        else:
	            # Process as image; the context manager closes the file handle.
	            with Image.open(question_paper_path) as image:
	                all_text = pytesseract.image_to_string(image)

	        questions, answers = parse_question_paper_text(all_text)
	        question_paper.questions = questions
	        question_paper.answers = answers

	        # Clean the answers (remove any remaining unwanted patterns)
	        question_paper.clean_answers()

	        # Store the processed question paper globally
	        last_processed_question_paper_object = question_paper

	        return jsonify(question_paper.to_dict())

	    except Exception as e:
	        # Top-level request boundary: record the traceback, then report
	        # the failure to the client as before.
	        app.logger.exception("Failed to process question paper")
	        return jsonify({'error': str(e)}), 500