Spaces:
Running
Running
File size: 7,106 Bytes
class QuestionPaper:
    """Questions and answers extracted from a single question paper.

    Attributes:
        questions: list of question strings.
        answers: list of answer strings.
        path: location of the source file this paper was read from, or None.
    """

    def __init__(self, path=None):
        self.questions = []
        self.answers = []
        self.path = path

    def clean_answers(self):
        """Strip answers and remove header/footer text and blank entries.

        An answer is unwanted when it is blank or contains one of the known
        header strings (case-insensitive substring match).

        When the answers are paired with the questions (the question list is
        non-empty and both lists have the same length), an unwanted answer is
        replaced by ``""`` so that ``answers[i]`` keeps belonging to
        ``questions[i]``.  Otherwise unwanted answers are dropped.
        """
        unwanted_patterns = (
            "Time: 15 MinutesMarks: 20",
            "Time: 15 Minutes Marks: 20",
            "GENERAL KNOWLEDGE QUESTION PAPER WITH ANSWERS",
            "GENERAL KNOWLEDGE QUESTION PAPER",
        )
        # One case-insensitive alternation instead of a search per pattern.
        unwanted_re = re.compile(
            "|".join(re.escape(pattern) for pattern in unwanted_patterns),
            re.IGNORECASE,
        )

        # Dropping entries from a paired list would shift every later answer
        # onto the wrong question, so in that case blank the slot instead.
        paired = bool(self.questions) and len(self.answers) == len(self.questions)

        cleaned_answers = []
        for answer in self.answers:
            stripped = answer.strip()
            if stripped and not unwanted_re.search(answer):
                cleaned_answers.append(stripped)
            elif paired:
                cleaned_answers.append("")
        self.answers = cleaned_answers

    def add_question(self, question_text):
        """Append one question string to the paper."""
        self.questions.append(question_text)

    def add_answer(self, answer_text):
        """Append one answer string to the paper."""
        self.answers.append(answer_text)

    def to_dict(self):
        """Return the paper as a dict with 'questions' and 'answers' lists."""
        return {
            'questions': self.questions,
            'answers': self.answers
        }
def parse_question_paper_text(text):
    """Split OCR text of a question paper into parallel question/answer lists.

    A line is a question when it starts with a number followed by ``.`` or
    ``)`` and some text.  The line directly after a question is taken as its
    answer unless that line is itself a question; a question without such a
    line gets ``""`` as its answer.  Known header lines and any other line
    that neither is a question nor directly follows one are ignored.

    Args:
        text: raw text of the paper; empty or None yields two empty lists.

    Returns:
        A ``(questions, answers)`` tuple of equally long lists.  Questions
        keep their leading number.
    """
    if not text:
        return [], []

    # Header/footer lines to discard.  The "MinutesMarks" spelling is covered
    # by ``Minutes.*Marks`` because ``.*`` also matches the empty string.
    header_re = re.compile(
        r'GENERAL KNOWLEDGE QUESTION PAPER.*'
        r'|Time:\s*\d+\s*Minutes.*Marks:\s*\d+',
        re.IGNORECASE,
    )
    # Number, then '.' or ')', then at least one character of question text.
    question_re = re.compile(r'^\d+\s*[.)]\s*(.+)')

    lines = [
        stripped
        for stripped in (raw.strip() for raw in text.split('\n'))
        if stripped and not header_re.match(stripped)
    ]

    questions = []
    answers = []
    index = 0
    total = len(lines)
    while index < total:
        line = lines[index]
        index += 1
        if not question_re.match(line):
            # Stray line that does not directly follow a question.
            continue

        questions.append(line)
        if index < total and not question_re.match(lines[index]):
            # The following non-question line is this question's answer.
            answers.append(lines[index])
            index += 1
        else:
            # Next line is another question, or this was the last line.
            answers.append("")

    return questions, answers
@app.route('/process_question_paper', methods=['POST'])
def process_question_paper():
    """Run OCR on an uploaded question paper and return its questions/answers.

    Expects a multipart upload under the form field ``file``.  A file whose
    name ends in ``.pdf`` is converted page by page to images before OCR; any
    other file is opened as a single image.  The result is also stored in the
    module-level ``last_processed_question_paper_object``.

    Returns:
        JSON ``{'questions': [...], 'answers': [...]}`` on success,
        ``{'error': ...}`` with status 400 when no file was supplied, or
        status 500 when processing raised an exception.
    """
    global last_processed_question_paper_object

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    question_paper = QuestionPaper()
    try:
        # Create Images directory if it doesn't exist
        images_dir = os.path.join(app.root_path, 'Images')
        os.makedirs(images_dir, exist_ok=True)

        is_pdf = file.filename.lower().endswith('.pdf')
        question_paper_filename = "question_paper.pdf" if is_pdf else "question_paper.png"
        question_paper_path = os.path.join(images_dir, question_paper_filename)
        file.save(question_paper_path)
        question_paper.path = question_paper_path

        if is_pdf:
            # Use an explicit poppler location only when one is configured or
            # the historical Windows install directory exists; otherwise pass
            # None so pdf2image looks poppler up on PATH.
            windows_poppler = r'C:\Program Files\poppler\Library\bin'
            poppler_path = os.environ.get('POPPLER_PATH') or (
                windows_poppler if os.path.isdir(windows_poppler) else None
            )
            images_from_pdf = convert_from_path(question_paper_path, poppler_path=poppler_path)
            # Same text as appending "<page text>\n" for every page.
            text = "".join(
                pytesseract.image_to_string(page_image) + "\n"
                for page_image in images_from_pdf
            )
        else:
            # Close the image file handle once OCR is done.
            with Image.open(question_paper_path) as image:
                text = pytesseract.image_to_string(image)

        questions, answers = parse_question_paper_text(text)
        question_paper.questions = questions
        question_paper.answers = answers

        # Clean the answers (remove any remaining unwanted patterns)
        question_paper.clean_answers()

        # Store the processed question paper globally
        last_processed_question_paper_object = question_paper
        return jsonify(question_paper.to_dict())
    except Exception as e:
        # Top-level request boundary: record the traceback, then report.
        app.logger.exception("Failed to process question paper")
        return jsonify({'error': str(e)}), 500