Spaces:

mocktestgen
/

Hritwik

Runtime error

File size: 8,662 Bytes
import gradio as gr
import pdfplumber
from PIL import Image
import pytesseract
import io
import re
import random
from transformers import pipeline


# Use a stable and widely supported model for question generation
# Both pipelines are created at module level, so the models are loaded as
# soon as this module is imported (before the UI is built).
# NOTE(review): the prompts built further down use a "generate question: "
# prefix and <hl> answer markers; confirm that the plain "t5-base" checkpoint
# actually responds to that format — TODO verify against the model card.
qg_pipeline = pipeline("text2text-generation", model="t5-base")  # standard T5 base model

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # reliable summarizer


def extract_text_from_pdf(file_bytes):
    """Extract text from a PDF supplied as raw bytes.

    The embedded text of every page is read with pdfplumber. When no page
    yields any non-whitespace text, the document is handed to ``ocr_pdf``
    instead.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        The extracted text, one trailing newline per page that produced
        text, or an empty string when the document could not be processed.
    """
    import logging  # local import keeps this block self-contained

    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text come back falsy and are skipped.
        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        if not text.strip():
            text = ocr_pdf(file_bytes)
        return text
    except Exception:
        # Extraction stays best-effort (callers expect "" on failure), but the
        # traceback is recorded so a broken upload or a missing OCR backend is
        # not silently indistinguishable from an empty document.
        logging.getLogger(__name__).exception("PDF text extraction failed")
        return ""


def ocr_pdf(file_bytes):
    """Run OCR on every page of a PDF and return the recognised text.

    Each page is rendered with ``resolution=300`` and the resulting image is
    passed to Tesseract. The text of each page is followed by a newline.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        The concatenated OCR output of all pages.
    """
    recognised_pages = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as document:
        for page in document.pages:
            rendered = page.to_image(resolution=300)
            recognised = pytesseract.image_to_string(rendered.original)
            recognised_pages.append(recognised + "\n")
    return "".join(recognised_pages)


def extract_text_from_image(file_bytes):
    """Return the text Tesseract recognises in an image given as raw bytes.

    Args:
        file_bytes: Raw content of the image file.

    Returns:
        The OCR output for the image.
    """
    buffer = io.BytesIO(file_bytes)
    return pytesseract.image_to_string(Image.open(buffer))


def extract_text_from_txt(file_bytes):
    """Decode the raw bytes of a text file into a string.

    UTF-8 is tried first; a leading byte-order mark, if present, is dropped
    so it does not end up as an invisible character at the start of the text.
    Input that is not valid UTF-8 is decoded as Latin-1, which accepts every
    byte sequence.

    Args:
        file_bytes: Raw content of the text file.

    Returns:
        The decoded text.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but also strips an optional BOM.
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return file_bytes.decode("latin-1")


def clean_text(text):
    """Normalise whitespace in extracted text.

    Runs of newlines are collapsed to a single newline, runs of two or more
    spaces to a single space, and surrounding whitespace is trimmed.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text.
    """
    single_newlines = re.sub(r'\n+', '\n', text)
    single_spaces = re.sub(r'[ ]{2,}', ' ', single_newlines)
    return single_spaces.strip()


def split_to_sentences(text):
    """Split text into sentences at whitespace that follows ``.``, ``?`` or ``!``.

    Args:
        text: The text to split.

    Returns:
        A list of the non-empty, whitespace-trimmed sentences.
    """
    fragments = re.split(r'(?<=[.?!])\s+', text)
    return list(filter(None, map(str.strip, fragments)))


def highlight_answer_in_context(context, answer):
    """Wrap the first occurrence of ``answer`` in ``context`` with ``<hl>`` markers.

    The search is case-insensitive. The text before and after the match is
    whitespace-trimmed and the (trimmed) ``answer`` is placed between two
    ``<hl>`` tokens.

    Args:
        context: The text to search in.
        answer: The span to highlight.

    Returns:
        ``"<before> <hl> <answer> <hl> <after>"`` when the answer is found;
        otherwise ``context`` unchanged (also when ``answer`` is empty).
    """
    if not answer:
        # An empty needle "matches" at position 0 and would only add stray
        # markers in front of the context.
        return context

    # Search the original string rather than a lower-cased copy: lower-casing
    # can change the length of some characters, which would shift the index
    # and cut the context in the wrong place.
    match = re.search(re.escape(answer), context, re.IGNORECASE)
    if match is None:
        return context

    before = context[:match.start()]
    after = context[match.end():]
    return f"{before.strip()} <hl> {answer.strip()} <hl> {after.strip()}"


def generate_mcq(answer):
    """Build four multiple-choice options around a correct answer.

    Distractors are random re-orderings of the answer's words. When the
    answer does not have enough distinct re-orderings (a single word, two
    words, repeated words, ...), the remaining slots are filled with the
    answer followed by a punctuation mark.

    Args:
        answer: The correct answer text.

    Returns:
        A tuple ``(options, correct_letter)`` where ``options`` is a shuffled
        list of four distinct strings containing ``answer`` and
        ``correct_letter`` is the letter (``A``-``D``) of its position.
    """
    correct_answer = answer
    words = correct_answer.split()
    options = {correct_answer}

    if len(words) > 1:
        # Bounded number of attempts: an answer with fewer than three distinct
        # re-orderings can never fill the set, so an open-ended loop would
        # never terminate.
        for _ in range(50):
            if len(options) >= 4:
                break
            shuffled = words[:]
            random.shuffle(shuffled)
            option = ' '.join(shuffled)
            if option.lower() != correct_answer.lower():
                options.add(option)

    # Fill whatever is still missing with punctuation variants; the four
    # suffixes give four distinct strings, so this always reaches four options.
    suffixes = ['.', ',', '?', '!']
    random.shuffle(suffixes)
    for suffix in suffixes:
        if len(options) >= 4:
            break
        options.add(correct_answer + suffix)

    options = list(options)
    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter


def generate_questions_mcq(context, num_questions):
    """Generate multiple-choice questions from the given text.

    The first 15 sentences of ``context`` are used as candidate answers. For
    each one, the whole context with that sentence wrapped in ``<hl>`` markers
    is sent to the question-generation pipeline. Outputs that do not end in
    ``?`` or that repeat an earlier question are discarded.

    Args:
        context: The cleaned source text.
        num_questions: Stop once this many questions have been collected.

    Returns:
        A list of dicts with the keys ``question``, ``options``,
        ``correct_letter``, ``correct_answer`` and ``explanation``. When no
        usable question was produced, a single placeholder entry is returned.
    """
    collected = []
    seen_questions = set()

    for candidate in split_to_sentences(context)[:15]:
        # Prefix input for T5 question generation
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)
        question = generated[0]['generated_text']

        if question not in seen_questions and question.endswith('?'):
            seen_questions.add(question)
            options, correct_letter = generate_mcq(candidate)
            collected.append({
                "question": question,
                "options": options,
                "correct_letter": correct_letter,
                "correct_answer": candidate,
                "explanation": f"Answer explanation: {candidate}"
            })
            if len(collected) >= num_questions:
                break

    if collected:
        return collected

    # Nothing usable came out of the model: return a fixed placeholder entry.
    collected.append({
        "question": "What is the main topic discussed in the content?",
        "options": ["Option A", "Option B", "Option C", "Option D"],
        "correct_letter": "A",
        "correct_answer": "Option A",
        "explanation": "Fallback explanation."
    })
    return collected


def generate_questions_subjective(context, num_questions):
    """Generate open-ended questions with suggested answers from the given text.

    The first 20 sentences of ``context`` are used as candidates. For each
    one, the whole context with that sentence wrapped in ``<hl>`` markers is
    sent to the question-generation pipeline; outputs that do not end in ``?``
    or that repeat an earlier question are discarded. The suggested answer is
    the summarizer's output for the candidate sentence.

    Args:
        context: The cleaned source text.
        num_questions: Stop once this many questions have been collected.

    Returns:
        A list of dicts with the keys ``question`` and ``answer``. When no
        usable question was produced, a single placeholder entry is returned.
    """
    collected = []
    seen_questions = set()

    for candidate in split_to_sentences(context)[:20]:
        prompt = "generate question: " + highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(prompt, max_length=64, do_sample=False)
        question = generated[0]['generated_text']

        if question not in seen_questions and question.endswith('?'):
            seen_questions.add(question)
            summary = summarizer(candidate, max_length=50, min_length=10, do_sample=False)
            collected.append({
                "question": question,
                "answer": summary[0]['summary_text']
            })
            if len(collected) >= num_questions:
                break

    if collected:
        return collected

    # Nothing usable came out of the model: return a fixed placeholder entry.
    collected.append({
        "question": "Describe the main topic discussed in the content.",
        "answer": "The main topic is an overview of the content provided."
    })
    return collected


def format_mcq_output(questions):
    """Render multiple-choice questions as a plain-text list.

    Args:
        questions: Dicts with the keys ``question``, ``options``,
            ``correct_letter`` and ``explanation``.

    Returns:
        One entry per question (numbered from 1, options lettered A-D,
        followed by the correct letter and the explanation), entries
        separated by a blank line, with surrounding whitespace trimmed.
    """
    letters = ['A', 'B', 'C', 'D']
    pieces = []
    for number, item in enumerate(questions, 1):
        pieces.append(f"- Q{number}: {item['question']}\n")
        pieces.extend(
            f"  - {letters[position]}. {choice}\n"
            for position, choice in enumerate(item['options'])
        )
        pieces.append(f"- Correct Answer: {item['correct_letter']}\n")
        pieces.append(f"- Explanation: {item['explanation']}\n\n")
    return "".join(pieces).strip()


def format_subjective_output(questions):
    """Render open-ended questions and their suggested answers as plain text.

    Args:
        questions: Dicts with the keys ``question`` and ``answer``.

    Returns:
        One entry per question (numbered from 1), entries separated by a
        blank line, with surrounding whitespace trimmed.
    """
    entries = [
        f"- Q{number}: {item['question']}\n"
        f"- Suggested Answer: {item['answer']}\n\n"
        for number, item in enumerate(questions, 1)
    ]
    return "".join(entries).strip()


def _read_uploaded_file(file):
    """Return ``(filename, raw_bytes)`` for an uploaded file.

    Accepts a filesystem path (``str`` / ``os.PathLike``) or an object whose
    ``name`` attribute is the path of the upload. The content is read from
    that path when it exists on disk; otherwise the object's own ``read()``
    is used, and an object without ``read()`` yields empty bytes.
    """
    import os  # local import keeps this block self-contained

    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = getattr(file, "name", None)

    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return path, handle.read()

    # No backing file on disk: fall back to the object's own content, if any.
    data = file.read() if hasattr(file, "read") else b""
    return str(path or ""), data


def main_process(file, question_type, num_questions):
    """Turn an uploaded document into formatted practice questions.

    Args:
        file: The upload as delivered by the file component: a path string,
            or a file-like object whose ``name`` is the path of the upload.
        question_type: ``"MCQ"`` for multiple choice; any other value
            produces subjective questions.
        num_questions: Number of questions to generate.

    Returns:
        The formatted questions, or a human-readable message when no file
        was given, the file type is unsupported, or too little text could be
        extracted.
    """
    if not file:
        return "Please upload a file."

    # Read from the upload's path rather than calling file.read() directly:
    # the component may hand over a plain path string, which has no read().
    name, file_bytes = _read_uploaded_file(file)
    fname = name.lower()
    extracted_text = ""

    if fname.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(file_bytes)
    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        extracted_text = extract_text_from_image(file_bytes)
    elif fname.endswith(".txt"):
        extracted_text = extract_text_from_txt(file_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."

    extracted_text = clean_text(extracted_text)

    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."

    if question_type == "MCQ":
        questions = generate_questions_mcq(extracted_text, num_questions)
        return format_mcq_output(questions)
    else:
        questions = generate_questions_subjective(extracted_text, num_questions)
        return format_subjective_output(questions)


# Custom stylesheet handed to gr.Blocks below. The #header and #footer rules
# target the ids used in the Markdown snippets of the layout, and .output-area
# is the class assigned to the results textbox via elem_classes.
css = """
#header {
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
  font-weight: 700;
  font-size: 28px;
  text-align: center;
  margin-bottom: 20px;
  color: #333;
}
#footer {
  font-size: 12px;
  color: #666;
  margin-top: 30px;
  text-align: center;
}
.output-area {
  white-space: pre-wrap;
  background-color: #f3f4f6;
  padding: 15px;
  border-radius: 8px;
  font-family: monospace;
  max-height: 450px;
  overflow-y: auto;
}
.gr-button {
  background-color: #4f46e5;
  color: white;
  font-weight: bold;
  border-radius: 8px;
}
.gr-button:hover {
  background-color: #4338ca;
}
"""

# UI layout: upload + options on one row, results below, one button wired to
# main_process.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")
    with gr.Row():
        # `type` is deliberately left at the library default. The legacy
        # value "file" is rejected by newer Gradio releases when the
        # component is constructed, which stops the app from starting; the
        # default is accepted by every release.
        file_input = gr.File(label="Upload PDF, Image, or Text file")
        with gr.Column():
            question_type = gr.Radio(
                choices=["MCQ", "Subjective"], label="Question Type", value="MCQ"
            )
            num_questions = gr.Slider(
                1, 10, value=5, step=1, label="Number of Questions"
            )
            generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(
        label="Generated Questions",
        lines=20,
        interactive=False,
        elem_classes="output-area",
    )

    generate_btn.click(
        fn=main_process, inputs=[file_input, question_type, num_questions], outputs=output
    )

    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")


if __name__ == "__main__":
    demo.launch()