Spaces:
Build error
Update app.py
app.py
CHANGED

@@ -1,11 +1,11 @@
 import streamlit as st
 import fitz # PyMuPDF
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

 # Load model directly
 model_name = "openai-community/gpt2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model =
+model = AutoModelForCausalLM.from_pretrained(model_name)

 # Function to extract text from PDF
 def extract_text_from_pdf(pdf_file):
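
The next hunk resumes inside extract_text_from_pdf; lines 12-15 of the function are unchanged and collapsed by the diff view. Judging from the visible context (the function receives the uploaded file object, accumulates page.get_text(), and returns the text), the hidden body is presumably something like the following PyMuPDF sketch; the stream-based fitz.open call and the doc/page names are assumptions, not shown in this commit:

# Hedged reconstruction of the collapsed, unchanged lines of extract_text_from_pdf
def extract_text_from_pdf(pdf_file):
    text = ""
    # Streamlit's file_uploader yields a file-like object, so hand its bytes to PyMuPDF
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    for page in doc:
        text += page.get_text()
    return text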

@@ -16,54 +16,28 @@ def extract_text_from_pdf(pdf_file):
         text += page.get_text()
     return text

-# Function to
-def
-
-
-    return chunks
-
-# Function to generate MCQs using the model
-def generate_mcqs(text_chunks, num_questions=5):
-    if not text_chunks:
-        return ["No text extracted from the PDF. Unable to generate MCQs."]
+# Function to generate questions using GPT-2
+def generate_questions(text, num_questions=5):
+    if not text.strip():
+        return ["No text extracted from the PDF. Unable to generate questions."]

-    # Create the
+    # Create the text generation pipeline
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    mcqs = []
-
-    for chunk in text_chunks:
-        input_text = f"Based on the following text, generate a multiple-choice question along with four plausible options and mark the correct answer:\n\n{chunk}\n\nQuestion:"
-        generated = generator(input_text, max_length=400, num_return_sequences=1)
-        generated_text = generated[0]["generated_text"]
-
-        # Extract question and options
-        try:
-            question_part = generated_text.split("Question:")[1].strip()
-            question = question_part.split("Options:")[0].strip()
-            options_part = question_part.split("Options:")[1].strip()
-            options = options_part.split("\n")
-
-            # Ensure four options
-            if len(options) < 4:
-                continue
-
-            options = [f"Option {chr(65 + i)}: {option.strip()}" for i, option in enumerate(options[:4])]
-            correct_answer = options[0]  # Placeholder for correct answer identification logic
-
-            mcq_formatted = f"Q: {question}\n{options[0]}\n{options[1]}\n{options[2]}\n{options[3]}\nCorrect Answer: {correct_answer}"
-            mcqs.append(mcq_formatted)
-        except:
-            continue

-
-
+    questions = []
+    for _ in range(num_questions):
+        # Generate a single question at a time
+        prompt = f"Generate a question based on the following text:\n{text}\n\nQuestion:"
+        generated = generator(prompt, max_length=200, num_return_sequences=1)
+        question = generated[0]["generated_text"].split("Question:")[1].strip()
+        questions.append(question)

-    return
+    return questions

 # Streamlit app interface
-st.title("PDF to
+st.title("PDF to Question Generator")

-uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

 if uploaded_file is not None:
     st.write("Extracting text from the PDF...")
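
Two behaviors of the committed generate_questions are worth flagging: the text-generation pipeline is rebuilt on every Streamlit rerun, and max_length=200 also counts the prompt tokens, so a long extracted PDF can overrun GPT-2's 1024-token context window. A hedged variant along the following lines would address both; st.cache_resource, the 900-token clip, and max_new_tokens are assumptions layered on top of the commit, not part of it:

# Sketch only: cache the GPT-2 pipeline across reruns and clip the prompt
# so prompt plus generated tokens stay inside GPT-2's 1024-token window.
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

@st.cache_resource  # assumes a Streamlit version that provides cache_resource
def load_generator(model_name="openai-community/gpt2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_questions(text, num_questions=5):
    if not text.strip():
        return ["No text extracted from the PDF. Unable to generate questions."]
    generator = load_generator()
    # Keep roughly 900 source tokens so the prompt plus 60 new tokens fit in 1024
    ids = generator.tokenizer.encode(text)[:900]
    clipped = generator.tokenizer.decode(ids)
    questions = []
    for _ in range(num_questions):
        prompt = f"Generate a question based on the following text:\n{clipped}\n\nQuestion:"
        out = generator(prompt, max_new_tokens=60, num_return_sequences=1,
                        pad_token_id=generator.tokenizer.eos_token_id)
        questions.append(out[0]["generated_text"].split("Question:")[-1].strip())
    return questions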

@@ -71,11 +45,10 @@ if uploaded_file is not None:
     st.write("Text extracted successfully!")
     st.write("Extracted Text:", text)

-    st.write("Generating
-    num_questions = st.number_input("Number of
-
-    mcqs = generate_mcqs(text_chunks, num_questions)
+    st.write("Generating questions...")
+    num_questions = st.number_input("Number of questions to generate", min_value=1, max_value=20, value=5, step=1, format="%d")
+    questions = generate_questions(text, num_questions)

-    st.write("Generated
-    for idx,
-    st.write(f"{idx+1}. {
+    st.write("Generated Questions:")
+    for idx, question in enumerate(questions):
+        st.write(f"{idx+1}. {question}")
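
In the committed interface flow, the number input is rendered after the "Generating questions..." message, and any change to it reruns the whole script and regenerates immediately. A hedged rearrangement gated on st.button (the button itself is an assumption, not in this commit) keeps GPT-2 from running on every widget interaction:

# Sketch of a button-gated flow for the block inside `if uploaded_file is not None:`
num_questions = st.number_input("Number of questions to generate",
                                min_value=1, max_value=20, value=5, step=1)
if st.button("Generate questions"):
    st.write("Generating questions...")
    questions = generate_questions(text, int(num_questions))
    st.write("Generated Questions:")
    for idx, question in enumerate(questions):
        st.write(f"{idx + 1}. {question}")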