Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pdfplumber | |
| from PIL import Image | |
| import pytesseract | |
| import io | |
| import re | |
| import random | |
| from transformers import pipeline | |
# Use a stable and widely supported model for question generation
# NOTE: both pipeline() calls download model weights on first run, so module
# import blocks until the models are cached locally.
# NOTE(review): plain t5-base is not fine-tuned for question generation and the
# "generate question:" prefix used below is not one of its trained task
# prefixes — generated questions may be low quality; a QG-specific checkpoint
# would likely work better. Confirm before relying on output quality.
qg_pipeline = pipeline("text2text-generation", model="t5-base") # standard T5 base model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # reliable summarizer
def extract_text_from_pdf(file_bytes):
    """Extract the text layer from a PDF; fall back to OCR when there is none.

    Returns the extracted text, or "" if the PDF cannot be parsed at all.
    """
    try:
        pages = []
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for pg in pdf.pages:
                layer = pg.extract_text()
                if layer:
                    pages.append(layer)
        text = "".join(p + "\n" for p in pages)
        # A scanned PDF has no text layer — rasterize and OCR instead.
        if not text.strip():
            text = ocr_pdf(file_bytes)
        return text
    except Exception:
        # Best-effort: an unreadable PDF yields empty text rather than a crash.
        return ""
def ocr_pdf(file_bytes):
    """OCR every page of a PDF by rasterizing at 300 DPI and running Tesseract."""
    chunks = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for pg in pdf.pages:
            rendered = pg.to_image(resolution=300).original
            chunks.append(pytesseract.image_to_string(rendered) + "\n")
    return "".join(chunks)
def extract_text_from_image(file_bytes):
    """Run Tesseract OCR over an image supplied as raw bytes."""
    return pytesseract.image_to_string(Image.open(io.BytesIO(file_bytes)))
def extract_text_from_txt(file_bytes):
    """Decode raw bytes as UTF-8, falling back to Latin-1 (which accepts any byte)."""
    try:
        return file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a character, so this cannot fail.
        return file_bytes.decode("latin-1")
def clean_text(text):
    """Collapse blank-line runs and repeated spaces, then trim the result."""
    collapsed = re.sub(r"\n+", "\n", text)
    collapsed = re.sub(r" {2,}", " ", collapsed)
    return collapsed.strip()
def split_to_sentences(text):
    """Split text into sentences on terminal punctuation followed by whitespace."""
    fragments = re.split(r"(?<=[.?!])\s+", text)
    cleaned = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in cleaned if fragment]
def highlight_answer_in_context(context, answer):
    """Wrap the first case-insensitive occurrence of answer in <hl> markers.

    Returns the context unchanged when the answer does not appear in it.
    """
    pos = context.lower().find(answer.lower())
    if pos == -1:
        return context
    before = context[:pos].strip()
    after = context[pos + len(answer):].strip()
    return f"{before} <hl> {answer.strip()} <hl> {after}"
def generate_mcq(answer):
    """Build four answer options (one correct) and the correct option's letter.

    Distractors are word-order shuffles of the answer; single-word answers get
    punctuation-suffixed variants instead.

    Bug fix: the original looped `while len(options) < 4` unconditionally, which
    never terminated for two-word answers (only one distinct shuffle exists) or
    for answers whose words are all identical (every shuffle equals the answer).
    The loop is now attempt-bounded, with a deterministic punctuation-variant
    fallback that guarantees four options.

    Returns:
        (options, correct_letter): a shuffled list of 4 unique option strings,
        and the letter in 'ABCD' indexing the correct answer within it.
    """
    correct_answer = answer
    words = correct_answer.split()
    options = {correct_answer}
    attempts = 0
    # Bounded: shuffling may be unable to produce 3 distinct distractors.
    while len(options) < 4 and attempts < 50:
        attempts += 1
        if len(words) > 1:
            shuffled = words[:]
            random.shuffle(shuffled)
            option = ' '.join(shuffled)
            if option.lower() != correct_answer.lower():
                options.add(option)
        else:
            options.add(correct_answer + random.choice(['.', ',', '?', '!']))
    # Fallback: pad with punctuation variants so we always reach 4 options.
    for suffix in ['.', ',', '?', '!']:
        if len(options) >= 4:
            break
        options.add(correct_answer + suffix)
    options = list(options)
    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter
def generate_questions_mcq(context, num_questions):
    """Generate up to num_questions multiple-choice questions from the context.

    Each candidate sentence is highlighted inside the full context, fed to the
    question-generation model, and paired with shuffled MCQ options. Falls back
    to a single placeholder question when the model produces nothing usable.
    """
    questions_structured = []
    seen = set()
    # Cap candidates to bound model calls on long documents.
    for candidate in split_to_sentences(context)[:15]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)[0]['generated_text']
        # Keep only novel outputs that actually look like questions.
        if generated in seen or not generated.endswith('?'):
            continue
        seen.add(generated)
        choices, letter = generate_mcq(candidate)
        questions_structured.append({
            "question": generated,
            "options": choices,
            "correct_letter": letter,
            "correct_answer": candidate,
            "explanation": f"Answer explanation: {candidate}",
        })
        if len(questions_structured) >= num_questions:
            break
    if not questions_structured:
        # Fallback so the UI never shows an empty result.
        questions_structured.append({
            "question": "What is the main topic discussed in the content?",
            "options": ["Option A", "Option B", "Option C", "Option D"],
            "correct_letter": "A",
            "correct_answer": "Option A",
            "explanation": "Fallback explanation.",
        })
    return questions_structured
def generate_questions_subjective(context, num_questions):
    """Generate up to num_questions open-ended questions with suggested answers.

    Questions come from the QG model; each suggested answer is a summarization
    of the source sentence. Falls back to one placeholder pair when the model
    produces nothing usable.
    """
    questions_structured = []
    seen = set()
    # Cap candidates to bound model calls on long documents.
    for candidate in split_to_sentences(context)[:20]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)[0]['generated_text']
        # Keep only novel outputs that actually look like questions.
        if generated in seen or not generated.endswith('?'):
            continue
        seen.add(generated)
        # Summarize the source sentence to serve as the suggested answer.
        suggested = summarizer(candidate, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
        questions_structured.append({
            "question": generated,
            "answer": suggested,
        })
        if len(questions_structured) >= num_questions:
            break
    if not questions_structured:
        # Fallback so the UI never shows an empty result.
        questions_structured.append({
            "question": "Describe the main topic discussed in the content.",
            "answer": "The main topic is an overview of the content provided.",
        })
    return questions_structured
def format_mcq_output(questions):
    """Render structured MCQ dicts as a plain-text bullet list."""
    rendered = []
    for number, entry in enumerate(questions, 1):
        rendered.append(f"- Q{number}: {entry['question']}\n")
        for letter, option in zip('ABCD', entry['options']):
            rendered.append(f" - {letter}. {option}\n")
        rendered.append(f"- Correct Answer: {entry['correct_letter']}\n")
        rendered.append(f"- Explanation: {entry['explanation']}\n\n")
    return "".join(rendered).strip()
def format_subjective_output(questions):
    """Render question/answer dicts as a plain-text bullet list."""
    rendered = []
    for number, entry in enumerate(questions, 1):
        rendered.append(f"- Q{number}: {entry['question']}\n")
        rendered.append(f"- Suggested Answer: {entry['answer']}\n\n")
    return "".join(rendered).strip()
def main_process(file, question_type, num_questions):
    """Extract text from an uploaded file and generate questions from it.

    Args:
        file: the Gradio upload. Depending on the Gradio version and the
            `gr.File(type=...)` setting this may be a file-like object with
            `.read()`, a filesystem path string, or a tempfile wrapper exposing
            only `.name`. All three are handled, which fixes the crash on
            Gradio 4 where `type="file"` objects no longer exist and
            `file.read()` raised at runtime.
        question_type: "MCQ" or "Subjective".
        num_questions: maximum number of questions to generate.

    Returns:
        Formatted question text, or a human-readable error message.
    """
    if not file:
        return "Please upload a file."
    # Normalize the upload to (file_bytes, fname) regardless of Gradio version.
    if isinstance(file, str):
        # type="filepath": the value is a path string.
        fname = file.lower()
        with open(file, "rb") as fh:
            file_bytes = fh.read()
    elif hasattr(file, "read"):
        # Classic file-like object (legacy type="file").
        file_bytes = file.read()
        fname = file.name.lower()
    else:
        # Wrapper exposing only .name (e.g. Gradio's NamedString/tempfile).
        fname = file.name.lower()
        with open(file.name, "rb") as fh:
            file_bytes = fh.read()
    extracted_text = ""
    if fname.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(file_bytes)
    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        extracted_text = extract_text_from_image(file_bytes)
    elif fname.endswith(".txt"):
        extracted_text = extract_text_from_txt(file_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."
    extracted_text = clean_text(extracted_text)
    # Too little text means extraction (or OCR) effectively failed.
    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."
    if question_type == "MCQ":
        questions = generate_questions_mcq(extracted_text, num_questions)
        return format_mcq_output(questions)
    else:
        questions = generate_questions_subjective(extracted_text, num_questions)
        return format_subjective_output(questions)
# Custom CSS injected into the Gradio Blocks page: styles the header/footer
# banners, the scrollable monospace output panel, and the primary button.
css = """
#header {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-weight: 700;
    font-size: 28px;
    text-align: center;
    margin-bottom: 20px;
    color: #333;
}
#footer {
    font-size: 12px;
    color: #666;
    margin-top: 30px;
    text-align: center;
}
.output-area {
    white-space: pre-wrap;
    background-color: #f3f4f6;
    padding: 15px;
    border-radius: 8px;
    font-family: monospace;
    max-height: 450px;
    overflow-y: auto;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button:hover {
    background-color: #4338ca;
}
"""
# UI layout: file upload + question settings on top, generate button and a
# read-only results textbox below, wired to main_process.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")
    with gr.Row():
        # NOTE(review): gr.File's type="file" was removed in Gradio 4 (valid
        # values are "filepath" and "binary") and raises at build time there —
        # a likely cause of this Space's "Runtime error". Confirm the pinned
        # gradio version before relying on this setting.
        file_input = gr.File(label="Upload PDF, Image, or Text file", type="file")
        with gr.Column():
            question_type = gr.Radio(
                choices=["MCQ", "Subjective"], label="Question Type", value="MCQ"
            )
            num_questions = gr.Slider(
                1, 10, value=5, step=1, label="Number of Questions"
            )
    generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(
        label="Generated Questions",
        lines=20,
        interactive=False,
        elem_classes="output-area",
    )
    generate_btn.click(
        fn=main_process, inputs=[file_input, question_type, num_questions], outputs=output
    )
    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")
# Launch the app only when run as a script (Spaces also calls demo.launch()).
if __name__ == "__main__":
    demo.launch()