"""Question-generation app.

Extracts text from an uploaded PDF, image, or TXT file (with OCR fallback
for scanned PDFs), then uses Hugging Face pipelines to generate either
multiple-choice or subjective questions, served through a Gradio UI.
"""

import io
import random
import re

import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from transformers import pipeline

# Models load once at import time; both are stable, widely mirrored checkpoints.
qg_pipeline = pipeline("text2text-generation", model="t5-base")  # question generation
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # answer summarizer


def extract_text_from_pdf(file_bytes):
    """Return text from a PDF, falling back to OCR when no text layer exists.

    Best-effort: any extraction failure returns "" so the caller can show a
    friendly "too short or empty" message instead of crashing.
    """
    try:
        text = ""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if not text.strip():
            # Scanned PDF with no embedded text layer -> OCR each page image.
            text = ocr_pdf(file_bytes)
        return text
    except Exception:
        return ""


def ocr_pdf(file_bytes):
    """OCR every page of a PDF by rasterizing it and running Tesseract."""
    text = ""
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # 300 DPI gives Tesseract enough resolution for reliable OCR.
            pil_image = page.to_image(resolution=300).original
            page_text = pytesseract.image_to_string(pil_image)
            text += page_text + "\n"
    return text


def extract_text_from_image(file_bytes):
    """Run Tesseract OCR on raw image bytes and return the recognized text."""
    image = Image.open(io.BytesIO(file_bytes))
    text = pytesseract.image_to_string(image)
    return text


def extract_text_from_txt(file_bytes):
    """Decode raw text-file bytes, preferring UTF-8 with a latin-1 fallback."""
    try:
        text = file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this fallback cannot raise.
        text = file_bytes.decode("latin-1")
    return text


def clean_text(text):
    """Collapse repeated newlines and runs of spaces; strip outer whitespace."""
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ ]{2,}', ' ', text)
    return text.strip()


def split_to_sentences(text):
    """Split text into sentences on terminal punctuation, dropping blanks."""
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return [s.strip() for s in sentences if s.strip()]


def highlight_answer_in_context(context, answer):
    """Re-embed *answer* (case-insensitively located) into *context*.

    If the answer is not found, the context is returned unchanged.
    """
    idx = context.lower().find(answer.lower())
    if idx != -1:
        part1 = context[:idx]
        part2 = context[idx + len(answer):]
        return f"{part1.strip()} {answer.strip()} {part2.strip()}"
    else:
        return context


def generate_mcq(answer):
    """Build four answer options: the correct answer plus three distractors.

    Distractors are word-shuffled variants of the correct answer; single-word
    answers get punctuation-suffixed copies instead.

    Returns:
        (options, correct_letter): the shuffled list of four options and the
        letter ('A'-'D') of the correct one.
    """
    correct_answer = answer
    words = correct_answer.split()
    options = {correct_answer}
    # Bounded attempts guard against an infinite loop: when every word in the
    # answer is identical (e.g. "very very"), shuffling always reproduces the
    # original and the original unbounded loop never terminated.
    attempts = 0
    while len(options) < 4 and attempts < 50:
        attempts += 1
        if len(words) > 1:
            shuffled = words[:]
            random.shuffle(shuffled)
            option = ' '.join(shuffled)
            if option.lower() != correct_answer.lower():
                options.add(option)
        else:
            option = correct_answer + random.choice(['.', ',', '?', '!'])
            options.add(option)
    # Deterministic fillers so we always end up with exactly four options.
    filler = 0
    while len(options) < 4:
        filler += 1
        options.add(f"{correct_answer} ({filler})")
    options = list(options)
    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter


def generate_questions_mcq(context, num_questions):
    """Generate up to *num_questions* MCQs from *context*.

    Each of the first 15 sentences is treated as a candidate answer; the QG
    model produces a question for it, and distractors come from generate_mcq.
    A generic fallback question is returned if nothing usable is produced.
    """
    sentences = split_to_sentences(context)
    questions_structured = []
    used_questions = set()
    candidates = sentences[:15]
    for sentence in candidates:
        input_text = highlight_answer_in_context(context, sentence)
        input_text_for_model = "generate question: " + input_text
        question = qg_pipeline(
            input_text_for_model, max_length=64, do_sample=False
        )[0]['generated_text']
        # Skip duplicates and outputs that are not actually questions.
        if question in used_questions or not question.endswith('?'):
            continue
        used_questions.add(question)
        options, correct_letter = generate_mcq(sentence)
        questions_structured.append({
            "question": question,
            "options": options,
            "correct_letter": correct_letter,
            "correct_answer": sentence,
            "explanation": f"Answer explanation: {sentence}"
        })
        if len(questions_structured) >= num_questions:
            break
    if not questions_structured:
        question = "What is the main topic discussed in the content?"
        options = ["Option A", "Option B", "Option C", "Option D"]
        questions_structured.append({
            "question": question,
            "options": options,
            "correct_letter": "A",
            "correct_answer": "Option A",
            "explanation": "Fallback explanation."
        })
    return questions_structured


def generate_questions_subjective(context, num_questions):
    """Generate up to *num_questions* open-ended questions with model answers.

    Mirrors generate_questions_mcq but summarizes each candidate sentence to
    produce a suggested answer instead of building distractor options.
    """
    sentences = split_to_sentences(context)
    questions_structured = []
    used_questions = set()
    candidates = sentences[:20]
    for sentence in candidates:
        input_text = highlight_answer_in_context(context, sentence)
        input_text_for_model = "generate question: " + input_text
        question = qg_pipeline(
            input_text_for_model, max_length=64, do_sample=False
        )[0]['generated_text']
        if question in used_questions or not question.endswith('?'):
            continue
        used_questions.add(question)
        answer = summarizer(
            sentence, max_length=50, min_length=10, do_sample=False
        )[0]['summary_text']
        questions_structured.append({
            "question": question,
            "answer": answer
        })
        if len(questions_structured) >= num_questions:
            break
    if not questions_structured:
        questions_structured.append({
            "question": "Describe the main topic discussed in the content.",
            "answer": "The main topic is an overview of the content provided."
        })
    return questions_structured


def format_mcq_output(questions):
    """Render MCQ dicts as a markdown-ish text block for the output textbox."""
    output = ""
    for idx, q in enumerate(questions, 1):
        output += f"- Q{idx}: {q['question']}\n"
        opts = ['A', 'B', 'C', 'D']
        for opt_idx, option in enumerate(q['options']):
            output += f"  - {opts[opt_idx]}. {option}\n"
        output += f"- Correct Answer: {q['correct_letter']}\n"
        output += f"- Explanation: {q['explanation']}\n\n"
    return output.strip()


def format_subjective_output(questions):
    """Render subjective question dicts as a plain-text block."""
    output = ""
    for idx, q in enumerate(questions, 1):
        output += f"- Q{idx}: {q['question']}\n"
        output += f"- Suggested Answer: {q['answer']}\n\n"
    return output.strip()


def main_process(file, question_type, num_questions):
    """Gradio callback: route the upload by extension, then generate questions.

    Accepts either a filepath string (gr.File type="filepath") or a file-like
    object with .name/.read() (older Gradio versions), for compatibility.
    """
    if not file:
        return "Please upload a file."
    if isinstance(file, str):
        path = file
        with open(path, "rb") as fh:
            file_bytes = fh.read()
    else:
        path = file.name
        file_bytes = file.read()
    fname = path.lower()
    extracted_text = ""
    if fname.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(file_bytes)
    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        extracted_text = extract_text_from_image(file_bytes)
    elif fname.endswith(".txt"):
        extracted_text = extract_text_from_txt(file_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."
    extracted_text = clean_text(extracted_text)
    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."
    if question_type == "MCQ":
        questions = generate_questions_mcq(extracted_text, num_questions)
        return format_mcq_output(questions)
    else:
        questions = generate_questions_subjective(extracted_text, num_questions)
        return format_subjective_output(questions)


css = """
#header {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-weight: 700;
    font-size: 28px;
    text-align: center;
    margin-bottom: 20px;
    color: #333;
}
#footer {
    font-size: 12px;
    color: #666;
    margin-top: 30px;
    text-align: center;
}
.output-area {
    white-space: pre-wrap;
    background-color: #f3f4f6;
    padding: 15px;
    border-radius: 8px;
    font-family: monospace;
    max-height: 450px;
    overflow-y: auto;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button:hover {
    background-color: #4338ca;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("")
    with gr.Row():
        # type="filepath" is the supported value in current Gradio releases;
        # the original type="file" is rejected by Gradio 3.x/4.x.
        file_input = gr.File(label="Upload PDF, Image, or Text file", type="filepath")
        with gr.Column():
            question_type = gr.Radio(
                choices=["MCQ", "Subjective"], label="Question Type", value="MCQ"
            )
            num_questions = gr.Slider(
                1, 10, value=5, step=1, label="Number of Questions"
            )
    generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(
        label="Generated Questions",
        lines=20,
        interactive=False,
        elem_classes="output-area",
    )
    generate_btn.click(
        fn=main_process,
        inputs=[file_input, question_type, num_questions],
        outputs=output,
    )
    gr.Markdown("")

if __name__ == "__main__":
    demo.launch()