pratikshahp committed on
Commit
eefadc3
·
verified ·
1 Parent(s): 2c6cd54

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -50
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
 
5
  # Load model directly
6
  model_name = "openai-community/gpt2"
7
  tokenizer = AutoTokenizer.from_pretrained(model_name)
8
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
 
10
  # Function to extract text from PDF
11
  def extract_text_from_pdf(pdf_file):
@@ -16,54 +16,28 @@ def extract_text_from_pdf(pdf_file):
16
  text += page.get_text()
17
  return text
18
 
19
- # Function to split text into chunks
20
- def split_text(text, chunk_size=500):
21
- words = text.split()
22
- chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
23
- return chunks
24
-
25
- # Function to generate MCQs using the model
26
- def generate_mcqs(text_chunks, num_questions=5):
27
- if not text_chunks:
28
- return ["No text extracted from the PDF. Unable to generate MCQs."]
29
 
30
- # Create the question generation pipeline
31
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
32
- mcqs = []
33
-
34
- for chunk in text_chunks:
35
- input_text = f"Based on the following text, generate a multiple-choice question along with four plausible options and mark the correct answer:\n\n{chunk}\n\nQuestion:"
36
- generated = generator(input_text, max_length=400, num_return_sequences=1)
37
- generated_text = generated[0]["generated_text"]
38
-
39
- # Extract question and options
40
- try:
41
- question_part = generated_text.split("Question:")[1].strip()
42
- question = question_part.split("Options:")[0].strip()
43
- options_part = question_part.split("Options:")[1].strip()
44
- options = options_part.split("\n")
45
-
46
- # Ensure four options
47
- if len(options) < 4:
48
- continue
49
-
50
- options = [f"Option {chr(65 + i)}: {option.strip()}" for i, option in enumerate(options[:4])]
51
- correct_answer = options[0] # Placeholder for correct answer identification logic
52
-
53
- mcq_formatted = f"Q: {question}\n{options[0]}\n{options[1]}\n{options[2]}\n{options[3]}\nCorrect Answer: {correct_answer}"
54
- mcqs.append(mcq_formatted)
55
- except:
56
- continue
57
 
58
- if len(mcqs) >= num_questions:
59
- break
 
 
 
 
 
60
 
61
- return mcqs
62
 
63
  # Streamlit app interface
64
- st.title("PDF to MCQ Generator")
65
 
66
- uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
67
 
68
  if uploaded_file is not None:
69
  st.write("Extracting text from the PDF...")
@@ -71,11 +45,10 @@ if uploaded_file is not None:
71
  st.write("Text extracted successfully!")
72
  st.write("Extracted Text:", text)
73
 
74
- st.write("Generating MCQs...")
75
- num_questions = st.number_input("Number of MCQs to generate", min_value=1, max_value=20, value=5, step=1, format="%d")
76
- text_chunks = split_text(text)
77
- mcqs = generate_mcqs(text_chunks, num_questions)
78
 
79
- st.write("Generated MCQs:")
80
- for idx, mcq in enumerate(mcqs):
81
- st.write(f"{idx+1}. {mcq}")
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
 
5
  # Load model directly
6
  model_name = "openai-community/gpt2"
7
  tokenizer = AutoTokenizer.from_pretrained(model_name)
8
+ model = AutoModelForCausalLM.from_pretrained(model_name)
9
 
10
  # Function to extract text from PDF
11
  def extract_text_from_pdf(pdf_file):
 
16
  text += page.get_text()
17
  return text
18
 
19
# Function to generate questions using GPT-2
def generate_questions(text, num_questions=5):
    """Generate questions about ``text`` using the module-level GPT-2 model.

    Parameters
    ----------
    text : str
        Source text extracted from the PDF.
    num_questions : int, optional
        How many questions to generate (default 5).

    Returns
    -------
    list[str]
        The generated questions, or a single explanatory message when
        ``text`` is empty or whitespace-only.
    """
    if not text.strip():
        return ["No text extracted from the PDF. Unable to generate questions."]

    # Create the text generation pipeline once, outside the generation loop.
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    questions = []
    for _ in range(num_questions):
        # Generate a single question at a time
        prompt = f"Generate a question based on the following text:\n{text}\n\nQuestion:"
        # FIX: use max_new_tokens rather than max_length -- max_length counts
        # the prompt tokens too, so a long PDF text made generation fail or
        # return nothing.  return_full_text=False makes the pipeline return
        # only the continuation, replacing the fragile
        # split("Question:")[1], which broke whenever the PDF text itself
        # contained the substring "Question:".
        generated = generator(
            prompt,
            max_new_tokens=64,
            num_return_sequences=1,
            return_full_text=False,
        )
        question = generated[0]["generated_text"].strip()
        questions.append(question)

    return questions
36
 
37
  # Streamlit app interface
38
+ st.title("PDF to Question Generator")
39
 
40
+ uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
41
 
42
  if uploaded_file is not None:
43
  st.write("Extracting text from the PDF...")
 
45
  st.write("Text extracted successfully!")
46
  st.write("Extracted Text:", text)
47
 
48
+ st.write("Generating questions...")
49
+ num_questions = st.number_input("Number of questions to generate", min_value=1, max_value=20, value=5, step=1, format="%d")
50
+ questions = generate_questions(text, num_questions)
 
51
 
52
+ st.write("Generated Questions:")
53
+ for idx, question in enumerate(questions):
54
+ st.write(f"{idx+1}. {question}")