# app.py — PDF to Question Generator (Hugging Face Space by pratikshahp, commit 69d7207)
import streamlit as st
import fitz # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Load model directly.
# GPT-2 small from the Hugging Face Hub; downloaded and cached on first run.
# These module-level globals are shared by generate_questions() below, so
# the (slow) load happens once at import time rather than per request.
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the full text of an uploaded PDF.

    Args:
        pdf_file: A binary file-like object positioned at the start of a
            PDF document (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: The concatenated text of every page, in page order. Empty
        string for a PDF with no extractable text.
    """
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        # join() over a generator avoids quadratic string concatenation.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        # Release the MuPDF document handle; the original leaked it.
        pdf_document.close()
# Function to generate questions using GPT-2
def generate_questions(text, num_questions=5):
    """Generate questions about *text* with the module-level GPT-2 model.

    Args:
        text: Source text extracted from the PDF.
        num_questions: How many questions to generate (default 5).

    Returns:
        list[str]: Generated questions, or a single explanatory message
        when *text* is empty/whitespace.
    """
    if not text.strip():
        return ["No text extracted from the PDF. Unable to generate questions."]
    # Create the text generation pipeline (reuses the preloaded globals).
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    questions = []
    for _ in range(num_questions):
        # Generate a single question at a time.
        prompt = f"Generate a question based on the following text:\n{text}\n\nQuestion:"
        generated = generator(
            prompt,
            # max_new_tokens bounds only the continuation; the original
            # max_length=500 counted the prompt too, so long PDFs left no
            # room to generate (or errored). truncation=True keeps the
            # prompt within GPT-2's 1024-token context window.
            max_new_tokens=64,
            truncation=True,
            # GPT-2's default decoding is greedy, which would return the
            # same question num_questions times; sample for variety.
            do_sample=True,
            num_return_sequences=1,
        )
        full_text = generated[0]["generated_text"]
        # Take the text after the first "Question:" marker. partition()
        # never raises, unlike the original split(...)[1] which would
        # IndexError if the marker were ever absent.
        questions.append(full_text.partition("Question:")[2].strip())
    return questions
# Streamlit app interface
st.title("PDF to Question Generator")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
st.write("Extracting text from the PDF...")
text = extract_text_from_pdf(uploaded_file)
st.write("Text extracted successfully!")
st.write("Extracted Text:", text)
st.write("Generating questions...")
num_questions = st.number_input("Number of questions to generate", min_value=1, max_value=20, value=5, step=1, format="%d")
questions = generate_questions(text, num_questions)
st.write("Generated Questions:")
for idx, question in enumerate(questions):
st.write(f"{idx+1}. {question}")