mocktestgen committed on
Commit
2512970
·
verified ·
1 Parent(s): 0b20bc8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -0
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ from PIL import Image
4
+ import pytesseract
5
+ import io
6
+ import re
7
+ import random
8
+ from transformers import pipeline
9
+
10
+
11
# Question-generation model. NOTE(review): vanilla t5-base is not fine-tuned
# for question generation, so output quality depends entirely on the
# "generate question:" prompt used by the callers below — confirm this is the
# intended model.
qg_pipeline = pipeline("text2text-generation", model="t5-base")  # standard T5 base model

# Summarizer used to produce suggested answers for subjective questions.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # reliable summarizer
15
+
16
+
17
def extract_text_from_pdf(file_bytes):
    """Extract text from a PDF given as raw bytes.

    Reads the PDF text layer with pdfplumber, appending a newline after each
    non-empty page. If the whole document yields no text (e.g. a scanned
    PDF), falls back to OCR via ocr_pdf(). Returns "" on any error.
    """
    try:
        chunks = []
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    chunks.append(content + "\n")
        text = "".join(chunks)
        if not text.strip():
            # No embedded text layer — try OCR instead.
            text = ocr_pdf(file_bytes)
        return text
    except Exception:
        # Best-effort: any parsing failure is reported as "no text".
        return ""
30
+
31
+
32
def ocr_pdf(file_bytes):
    """OCR every page of a PDF (raw bytes) and return the concatenated text.

    Each page is rendered at 300 dpi and passed through Tesseract; a newline
    is appended after every page's text.
    """
    pieces = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            rendered = page.to_image(resolution=300).original
            pieces.append(pytesseract.image_to_string(rendered) + "\n")
    return "".join(pieces)
40
+
41
+
42
def extract_text_from_image(file_bytes):
    """Run Tesseract OCR on an image supplied as raw bytes; return the text."""
    picture = Image.open(io.BytesIO(file_bytes))
    return pytesseract.image_to_string(picture)
46
+
47
+
48
def extract_text_from_txt(file_bytes):
    """Decode raw bytes as UTF-8, falling back to Latin-1.

    Latin-1 maps every byte value, so the fallback can never raise.
    """
    try:
        return file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return file_bytes.decode("latin-1")
54
+
55
+
56
def clean_text(text):
    """Normalize whitespace in extracted text.

    Steps:
    - Convert Windows/old-Mac line endings (``\\r\\n``, ``\\r``) to ``\\n``.
      TXT uploads and OCR output frequently carry CRLF; without this the
      ``\\n+`` collapse below left stray ``\\r`` characters in the text.
    - Collapse runs of newlines to a single newline.
    - Collapse runs of two or more spaces to one space.
    - Strip leading/trailing whitespace.
    """
    text = re.sub(r'\r\n?', '\n', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ ]{2,}', ' ', text)
    return text.strip()
60
+
61
+
62
def split_to_sentences(text):
    """Split text into sentences at '.', '?' or '!' followed by whitespace.

    Empty fragments are dropped; each sentence is stripped of surrounding
    whitespace. The terminating punctuation stays attached to its sentence.
    """
    fragments = re.split(r'(?<=[.?!])\s+', text)
    cleaned = []
    for fragment in fragments:
        stripped = fragment.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
65
+
66
+
67
def highlight_answer_in_context(context, answer):
    """Wrap the first case-insensitive occurrence of *answer* in <hl> markers.

    The <hl> tokens mark the answer span for the question-generation model.
    Returns *context* unchanged when the answer is not found.
    """
    pos = context.lower().find(answer.lower())
    if pos == -1:
        return context
    before = context[:pos].strip()
    after = context[pos + len(answer):].strip()
    return f"{before} <hl> {answer.strip()} <hl> {after}"
75
+
76
+
77
def generate_mcq(answer):
    """Build a 4-option multiple-choice set for *answer*.

    Distractors are word-order shuffles of the answer (or punctuation-suffixed
    variants for single-word answers). Returns ``(options, correct_letter)``
    where *options* is a shuffled list of exactly 4 distinct strings and
    *correct_letter* is 'A'-'D' pointing at *answer*.

    Bug fixed: the original looped ``while len(options) < 4`` with no attempt
    bound. A two-word answer has only two word orders (one of which is the
    answer itself), and an answer whose words are all identical has only one,
    so the loop could never reach 4 options and hung forever. We now bound
    the shuffle attempts and pad with guaranteed-distinct suffixed variants.
    """
    correct_answer = answer
    words = correct_answer.split()
    options = {correct_answer}
    attempts = 0
    # Bounded best-effort: try shuffled word orders / punctuation variants.
    while len(options) < 4 and attempts < 25:
        attempts += 1
        if len(words) > 1:
            shuffled = words[:]
            random.shuffle(shuffled)
            option = ' '.join(shuffled)
            if option.lower() != correct_answer.lower():
                options.add(option)
        else:
            options.add(correct_answer + random.choice(['.', ',', '?', '!']))
    # Deterministic fallback: pad with distinct suffixed variants so we are
    # guaranteed to reach 4 options even when shuffling cannot produce them.
    for suffix in ['.', ',', '?', '!', '..', ',,']:
        if len(options) >= 4:
            break
        options.add(correct_answer + suffix)
    options = list(options)
    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter
96
+
97
+
98
def generate_questions_mcq(context, num_questions):
    """Generate up to *num_questions* MCQs from *context*.

    Each of the first 15 sentences is treated as a candidate answer: the
    sentence is highlighted inside the full context and fed to the T5
    question-generation pipeline. Generated questions that do not end in '?'
    or that duplicate an earlier question are skipped. If nothing usable is
    produced, a single generic placeholder question is returned.
    """
    results = []
    seen = set()
    for candidate in split_to_sentences(context)[:15]:
        highlighted = highlight_answer_in_context(context, candidate)
        # T5 expects a task prefix on the input text.
        prompt = "generate question: " + highlighted
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)
        question = generated[0]['generated_text']
        if not question.endswith('?') or question in seen:
            continue
        seen.add(question)
        options, correct_letter = generate_mcq(candidate)
        results.append({
            "question": question,
            "options": options,
            "correct_letter": correct_letter,
            "correct_answer": candidate,
            "explanation": f"Answer explanation: {candidate}",
        })
        if len(results) >= num_questions:
            break
    if not results:
        # Fallback so the UI never shows an empty result.
        results.append({
            "question": "What is the main topic discussed in the content?",
            "options": ["Option A", "Option B", "Option C", "Option D"],
            "correct_letter": "A",
            "correct_answer": "Option A",
            "explanation": "Fallback explanation.",
        })
    return results
133
+
134
+
135
def generate_questions_subjective(context, num_questions):
    """Generate up to *num_questions* open-ended question/answer pairs.

    Mirrors generate_questions_mcq but considers the first 20 sentences and
    produces a suggested answer by summarizing the candidate sentence with
    the BART summarizer. Falls back to one generic pair when nothing usable
    is generated.
    """
    results = []
    seen = set()
    for candidate in split_to_sentences(context)[:20]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        question = qg_pipeline(prompt, max_length=64, do_sample=False)[0]['generated_text']
        if not question.endswith('?') or question in seen:
            continue
        seen.add(question)
        summary = summarizer(candidate, max_length=50, min_length=10, do_sample=False)
        results.append({
            "question": question,
            "answer": summary[0]['summary_text'],
        })
        if len(results) >= num_questions:
            break
    if not results:
        # Fallback so the UI never shows an empty result.
        results.append({
            "question": "Describe the main topic discussed in the content.",
            "answer": "The main topic is an overview of the content provided.",
        })
    return results
160
+
161
+
162
def format_mcq_output(questions):
    """Render a list of MCQ dicts as a markdown-style bullet list.

    Each entry shows the question, its lettered options, the correct letter,
    and the explanation; entries are separated by a blank line.
    """
    letters = ['A', 'B', 'C', 'D']
    lines = []
    for number, item in enumerate(questions, 1):
        lines.append(f"- Q{number}: {item['question']}\n")
        for slot, option in enumerate(item['options']):
            lines.append(f" - {letters[slot]}. {option}\n")
        lines.append(f"- Correct Answer: {item['correct_letter']}\n")
        lines.append(f"- Explanation: {item['explanation']}\n\n")
    return "".join(lines).strip()
172
+
173
+
174
def format_subjective_output(questions):
    """Render subjective Q&A dicts as a bullet list with suggested answers."""
    blocks = [
        f"- Q{number}: {item['question']}\n- Suggested Answer: {item['answer']}\n\n"
        for number, item in enumerate(questions, 1)
    ]
    return "".join(blocks).strip()
180
+
181
+
182
def main_process(file, question_type, num_questions):
    """Gradio handler: extract text from an upload and generate questions.

    Parameters
    ----------
    file : file-like object or str
        The uploaded file. Gradio 3's ``gr.File(type="file")`` passes an open
        tempfile object; Gradio 4 passes a filepath string. Both are accepted
        here so the handler survives a Gradio upgrade (the original called
        ``file.read()`` unconditionally and raised AttributeError on a path).
    question_type : str
        "MCQ" for multiple choice; anything else is treated as subjective.
    num_questions : int
        Maximum number of questions to generate.

    Returns the formatted question text, or a human-readable error message.
    """
    if not file:
        return "Please upload a file."

    if isinstance(file, str):
        # Gradio 4 style: a plain filepath string.
        fname = file.lower()
        with open(file, "rb") as fh:
            file_bytes = fh.read()
    else:
        # Gradio 3 style: an open tempfile-like object with .read() and .name.
        file_bytes = file.read()
        fname = file.name.lower()

    if fname.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(file_bytes)
    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        extracted_text = extract_text_from_image(file_bytes)
    elif fname.endswith(".txt"):
        extracted_text = extract_text_from_txt(file_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."

    extracted_text = clean_text(extracted_text)

    # Guard against empty/garbage extraction before spending time on models.
    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."

    if question_type == "MCQ":
        return format_mcq_output(generate_questions_mcq(extracted_text, num_questions))
    return format_subjective_output(generate_questions_subjective(extracted_text, num_questions))
210
+
211
+
212
# Custom CSS injected into the Gradio Blocks app: header/footer typography,
# a scrollable monospace output panel, and indigo button styling.
css = """
#header {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-weight: 700;
    font-size: 28px;
    text-align: center;
    margin-bottom: 20px;
    color: #333;
}
#footer {
    font-size: 12px;
    color: #666;
    margin-top: 30px;
    text-align: center;
}
.output-area {
    white-space: pre-wrap;
    background-color: #f3f4f6;
    padding: 15px;
    border-radius: 8px;
    font-family: monospace;
    max-height: 450px;
    overflow-y: auto;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button:hover {
    background-color: #4338ca;
}
"""
246
+
247
# UI layout: file upload on the left; question-type radio, count slider and
# the generate button on the right; results in a read-only textbox below.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")
    with gr.Row():
        # NOTE(review): gr.File(type="file") is only accepted by Gradio 3.x;
        # Gradio 4 allows "filepath" or "binary" and raises on "file" —
        # confirm the pinned gradio version before deploying.
        file_input = gr.File(label="Upload PDF, Image, or Text file", type="file")
        with gr.Column():
            question_type = gr.Radio(
                choices=["MCQ", "Subjective"], label="Question Type", value="MCQ"
            )
            num_questions = gr.Slider(
                1, 10, value=5, step=1, label="Number of Questions"
            )
    generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(
        label="Generated Questions",
        lines=20,
        interactive=False,
        elem_classes="output-area",
    )

    # Wire the button to the processing pipeline.
    generate_btn.click(
        fn=main_process, inputs=[file_input, question_type, num_questions], outputs=output
    )

    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")


if __name__ == "__main__":
    demo.launch()