mocktestgen committed on
Commit
fc9cdbf
·
verified ·
1 Parent(s): 4284a74

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ from PIL import Image
4
+ import pytesseract
5
+ import io
6
+ import re
7
+ import random
8
+ from transformers import pipeline
9
+
10
# Build the question-generation and summarization pipelines once, at import
# time, so every request handled by this module reuses the same objects.
# NOTE(review): the prompts sent to qg_pipeline use a "generate question:"
# prefix with <hl> markers, which presumably targets a question-generation
# fine-tune; confirm that the plain "t5-base" checkpoint is intended.
qg_pipeline = pipeline(task="text2text-generation", model="t5-base")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
13
+
14
+
15
def extract_text_from_pdf(file_bytes):
    """Return the text of a PDF supplied as raw bytes.

    Every page is read with pdfplumber and each non-empty page text is
    followed by a newline. When the collected text is empty or whitespace
    only, the result of ``ocr_pdf(file_bytes)`` is returned instead.

    Any exception raised along the way (including inside the OCR fallback)
    is printed and an empty string is returned.
    """
    try:
        chunks = []
        with pdfplumber.open(io.BytesIO(file_bytes)) as document:
            for pdf_page in document.pages:
                extracted = pdf_page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
        combined = "".join(chunks)
        if combined.strip():
            return combined
        # No usable text layer: fall back to OCR.
        return ocr_pdf(file_bytes)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""
29
+
30
+
31
def ocr_pdf(file_bytes):
    """Run OCR over every page of a PDF supplied as raw bytes.

    Each page is rendered at a resolution of 300 via pdfplumber, the rendered
    image is passed to ``pytesseract.image_to_string`` and the recognized
    text of each page is followed by a newline.

    Returns the concatenated text of all pages. Exceptions are not handled
    here and propagate to the caller.
    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as document:
        recognized = [
            pytesseract.image_to_string(
                pdf_page.to_image(resolution=300).original
            ) + "\n"
            for pdf_page in document.pages
        ]
    return "".join(recognized)
39
+
40
+
41
def extract_text_from_image(file_bytes):
    """Return the OCR text of an image supplied as raw bytes.

    The bytes are opened with PIL and handed to
    ``pytesseract.image_to_string``. Any exception is printed and an empty
    string is returned instead.
    """
    try:
        picture = Image.open(io.BytesIO(file_bytes))
        return pytesseract.image_to_string(picture)
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""
49
+
50
+
51
def extract_text_from_txt(file_bytes):
    """Decode the raw bytes of a text file into a string.

    The bytes are decoded as UTF-8 first; if that fails they are decoded as
    Latin-1.

    Args:
        file_bytes: Raw content of the uploaded text file.

    Returns:
        The decoded text, without a leading byte-order mark.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 exactly like "utf-8" but also
        # drops a leading BOM, which would otherwise stay in the text as
        # "\ufeff".
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a code point, so this fallback
        # cannot raise.
        return file_bytes.decode("latin-1")
57
+
58
+
59
def clean_text(text):
    """Normalize whitespace in extracted text.

    Runs of newlines become a single newline, runs of two or more spaces
    become a single space, and leading/trailing whitespace is removed.
    """
    single_newlines = re.sub(r'\n+', '\n', text)
    single_spaces = re.sub(r'[ ]{2,}', ' ', single_newlines)
    return single_spaces.strip()
63
+
64
+
65
def split_to_sentences(text):
    """Split text into sentences.

    The text is cut at whitespace that follows a '.', '?' or '!'. Each piece
    is stripped and empty pieces are dropped.

    Returns a list of sentence strings (possibly empty).
    """
    pieces = []
    for fragment in re.split(r'(?<=[.?!])\s+', text):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
68
+
69
+
70
def highlight_answer_in_context(context, answer):
    """Wrap the first occurrence of ``answer`` in ``context`` with <hl> markers.

    The search is case-insensitive. The text before and after the match is
    stripped and the stripped ``answer`` is placed between two ``<hl>``
    markers.

    Args:
        context: Text to search in.
        answer: Text to highlight.

    Returns:
        The marked-up string, or ``context`` unchanged when ``answer`` is
        empty/whitespace-only or does not occur in ``context``.
    """
    # An empty answer matches at position 0 and would only produce a pair of
    # empty markers in front of the context.
    if not answer.strip():
        return context

    # Search the original string rather than a lowercased copy: lowercasing
    # can change the length of the text (e.g. "\u0130" becomes two
    # characters), which would make the found index point at the wrong place
    # in ``context``.
    match = re.search(re.escape(answer), context, flags=re.IGNORECASE)
    if match is None:
        return context

    before = context[:match.start()].strip()
    after = context[match.end():].strip()
    return f"{before} <hl> {answer.strip()} <hl> {after}"
77
+
78
+
79
def generate_mcq(answer):
    """Build four multiple-choice options around a correct answer.

    Distractors are random re-orderings of the answer's words. When the
    answer cannot supply three distinct re-orderings (single word, two
    words, repeated words), the remaining slots are filled with the answer
    followed by a punctuation character.

    Args:
        answer: The correct answer text.

    Returns:
        A tuple ``(options, correct_letter)`` where ``options`` is a shuffled
        list of four distinct strings containing ``answer`` and
        ``correct_letter`` is the letter ('A'-'D') of its position.
    """
    correct_answer = answer
    words = correct_answer.split()
    options = [correct_answer]
    taken = {correct_answer.lower()}

    if len(words) > 1:
        # The number of attempts is bounded: a two-word answer has only one
        # alternative word order, so retrying until three distractors exist
        # would never finish.
        for _ in range(20):
            if len(options) == 4:
                break
            shuffled = words[:]
            random.shuffle(shuffled)
            candidate = ' '.join(shuffled)
            if candidate.lower() not in taken:
                taken.add(candidate.lower())
                options.append(candidate)

    # Top up with punctuation variants. Each one is longer than any word
    # re-ordering of the answer, so it cannot duplicate an existing option.
    suffixes = ['.', ',', '?', '!']
    random.shuffle(suffixes)
    for suffix in suffixes:
        if len(options) == 4:
            break
        options.append(correct_answer + suffix)

    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter
98
+
99
+
100
def generate_questions_mcq(context, num_questions):
    """Generate multiple-choice questions from ``context``.

    The first 15 sentences of ``context`` are used as candidate answers. For
    each one, the sentence is highlighted inside the full context, the
    prompt is sent to ``qg_pipeline`` and the generated text is kept only if
    it ends with '?' and has not been produced before. Options come from
    ``generate_mcq``. Generation stops once ``num_questions`` questions have
    been collected.

    Returns a list of dicts with the keys "question", "options",
    "correct_letter", "correct_answer" and "explanation". When nothing was
    produced, a single placeholder question is returned.
    """
    collected = []
    seen = set()

    for candidate in split_to_sentences(context)[:15]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)
        question = generated[0]['generated_text']

        if question.endswith('?') and question not in seen:
            seen.add(question)
            options, correct_letter = generate_mcq(candidate)
            entry = {
                "question": question,
                "options": options,
                "correct_letter": correct_letter,
                "correct_answer": candidate,
                "explanation": f"Answer explanation: {candidate}",
            }
            collected.append(entry)
            if len(collected) >= num_questions:
                break

    if collected:
        return collected

    # Nothing usable was generated: return one generic placeholder.
    return [{
        "question": "What is the main topic discussed in the content?",
        "options": ["Option A", "Option B", "Option C", "Option D"],
        "correct_letter": "A",
        "correct_answer": "Option A",
        "explanation": "Fallback explanation.",
    }]
133
+
134
+
135
def generate_questions_subjective(context, num_questions):
    """Generate open-ended questions with suggested answers from ``context``.

    The first 20 sentences of ``context`` are used as candidates. For each
    one, the sentence is highlighted inside the full context, the prompt is
    sent to ``qg_pipeline`` and the generated text is kept only if it ends
    with '?' and has not been produced before. The suggested answer is the
    output of ``summarizer`` for that sentence. Generation stops once
    ``num_questions`` questions have been collected.

    Returns a list of dicts with the keys "question" and "answer". When
    nothing was produced, a single placeholder entry is returned.
    """
    collected = []
    seen = set()

    for candidate in split_to_sentences(context)[:20]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)
        question = generated[0]['generated_text']

        if not question.endswith('?') or question in seen:
            continue
        seen.add(question)

        summary = summarizer(candidate, max_length=50, min_length=10, do_sample=False)
        collected.append({
            "question": question,
            "answer": summary[0]['summary_text'],
        })
        if len(collected) >= num_questions:
            break

    if collected:
        return collected

    # Nothing usable was generated: return one generic placeholder.
    return [{
        "question": "Describe the main topic discussed in the content.",
        "answer": "The main topic is an overview of the content provided.",
    }]
160
+
161
+
162
def format_mcq_output(questions):
    """Render MCQ dicts as a bulleted text block.

    Each question contributes a "- Qn:" line, one lettered line per option,
    a "- Correct Answer:" line and a "- Explanation:" line, followed by a
    blank line. The final text is stripped of surrounding whitespace.

    Each item must provide the keys "question", "options", "correct_letter"
    and "explanation"; only four option letters (A-D) are available.
    """
    letters = ['A', 'B', 'C', 'D']
    lines = []
    for number, item in enumerate(questions, 1):
        lines.append(f"- Q{number}: {item['question']}\n")
        for position, choice in enumerate(item['options']):
            lines.append(f" - {letters[position]}. {choice}\n")
        lines.append(f"- Correct Answer: {item['correct_letter']}\n")
        lines.append(f"- Explanation: {item['explanation']}\n\n")
    return "".join(lines).strip()
172
+
173
+
174
def format_subjective_output(questions):
    """Render subjective question dicts as a bulleted text block.

    Each item (keys "question" and "answer") contributes a "- Qn:" line and
    a "- Suggested Answer:" line followed by a blank line. The final text is
    stripped of surrounding whitespace.
    """
    rendered = [
        f"- Q{number}: {item['question']}\n"
        f"- Suggested Answer: {item['answer']}\n\n"
        for number, item in enumerate(questions, 1)
    ]
    return "".join(rendered).strip()
180
+
181
+
182
def main_process(file_bytes, question_type, num_questions):
    """Turn an uploaded file into formatted questions.

    The file type is detected from the content because no filename is
    available: bytes starting with ``%PDF`` are treated as a PDF, bytes that
    PIL can open are treated as an image, and anything else is decoded as
    text.

    Args:
        file_bytes: Raw bytes of the uploaded file, or None when nothing was
            uploaded.
        question_type: "MCQ" selects multiple-choice questions; any other
            value selects subjective questions.
        num_questions: Number of questions to generate.

    Returns:
        The formatted questions, or a user-facing message when no file was
        given or fewer than 30 characters of text could be extracted.
    """
    if file_bytes is None:
        return "Please upload a file."

    if file_bytes.startswith(b'%PDF'):
        extracted_text = extract_text_from_pdf(file_bytes)
    else:
        try:
            # Probe only: opening succeeds for data PIL recognizes as an
            # image. The handle is closed again right away; the OCR helper
            # opens the bytes itself.
            with Image.open(io.BytesIO(file_bytes)):
                pass
        except Exception:
            # Deliberately broad: whatever the reason PIL cannot open the
            # data, treat the upload as plain text.
            extracted_text = extract_text_from_txt(file_bytes)
        else:
            extracted_text = extract_text_from_image(file_bytes)

    extracted_text = clean_text(extracted_text)

    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."

    if question_type == "MCQ":
        questions = generate_questions_mcq(extracted_text, num_questions)
        return format_mcq_output(questions)

    questions = generate_questions_subjective(extracted_text, num_questions)
    return format_subjective_output(questions)
212
+
213
+
214
# Stylesheet handed to gr.Blocks: styles the header and footer elements, the
# output area and the buttons.
css = """
#header {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
font-weight: 700;
font-size: 28px;
text-align: center;
margin-bottom: 20px;
color: #333;
}
#footer {
font-size: 12px;
color: #666;
margin-top: 30px;
text-align: center;
}
.output-area {
white-space: pre-wrap;
background-color: #f3f4f6;
padding: 15px;
border-radius: 8px;
font-family: monospace;
max-height: 450px;
overflow-y: auto;
}
.gr-button {
background-color: #4f46e5;
color: white;
font-weight: bold;
border-radius: 8px;
}
.gr-button:hover {
background-color: #4338ca;
}
"""

# NOTE(review): the nesting of the layout below (column inside the row,
# output box below the row) is reconstructed; the original indentation was
# not available. Confirm against the deployed app.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")

    with gr.Row():
        # The upload is delivered to the callback as raw bytes.
        file_input = gr.File(
            label="Upload PDF, Image, or Text file",
            type="binary",
        )
        with gr.Column():
            question_type = gr.Radio(
                choices=["MCQ", "Subjective"],
                label="Question Type",
                value="MCQ",
            )
            num_questions = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of Questions",
            )
            generate_btn = gr.Button("Generate Questions")

    output = gr.Textbox(
        label="Generated Questions",
        lines=20,
        interactive=False,
        elem_classes="output-area",
    )

    # Clicking the button runs main_process on the three inputs and writes
    # its return value into the output box.
    generate_btn.click(
        fn=main_process,
        inputs=[file_input, question_type, num_questions],
        outputs=output,
    )

    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")

if __name__ == "__main__":
    demo.launch()