# pdf-to-mcqs / app.py
# Hugging Face Space by Ali-Raza-167 (commit f7c8822, verified)
import csv
import os
import re
import tempfile
from typing import List, Tuple

import google.generativeai as genai
import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
class PDFToMCQGenerator:
    """OCR a PDF, split its text into statements, and generate MCQs via Gemini.

    Pipeline (orchestrated by :meth:`process_pdf`):
    ``configure_gemini`` -> ``extract_text_from_pdf`` ->
    ``split_into_statements`` -> ``batch_statements`` -> ``generate_mcqs``.
    """

    def __init__(self):
        # The Gemini model is created lazily, once an API key is supplied.
        self.model = None
        self.configured = False

    def configure_gemini(self, api_key: str) -> str:
        """Configure the Gemini client with *api_key*; return a status message."""
        try:
            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel('gemini-pro')
            self.configured = True
            return "✅ Gemini configured successfully!"
        except Exception as e:
            return f"❌ Error configuring Gemini: {str(e)}"

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """OCR every page of the PDF at *pdf_path*; return one string per page.

        Raises:
            Exception: if page rendering or OCR fails (wraps the cause).
        """
        try:
            pages = convert_from_path(pdf_path)
            return [pytesseract.image_to_string(page) for page in pages]
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def split_into_statements(self, page_texts: List[str]) -> List[str]:
        """Split page texts into sentence-like statements on ``.``, ``!``, ``?``."""
        all_statements: List[str] = []
        for page_text in page_texts:
            # Drop empty fragments produced by consecutive terminators/whitespace.
            all_statements.extend(
                s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()
            )
        return all_statements

    def batch_statements(self, statements: List[str], batch_size: int = 5) -> List[List[str]]:
        """Group *statements* into consecutive chunks of at most *batch_size*."""
        return [statements[i:i + batch_size] for i in range(0, len(statements), batch_size)]

    def generate_mcqs(self, text_block: str) -> List[List[str]]:
        """Ask Gemini for 5 MCQs over *text_block*; return rows of 6 fields.

        Each row is ``[Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer]``.
        Response lines that do not parse into exactly six CSV fields are skipped.

        Raises:
            Exception: if Gemini is not configured or the API call fails.
        """
        if not self.configured:
            raise Exception("Gemini not configured. Please provide API key first.")
        prompt = f"""
Generate exactly 5 MCQs from the following statements.
Each question must have:
- Clear, concise Question
- 4 Options (A-D) with only one correct answer
- Correct Answer (ONLY the letter A, B, C, or D — no text)
Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Text:
{text_block}
Example format:
"What is the capital of France?","Paris","London","Berlin","Madrid","A"
"Which planet is known as the Red Planet?","Earth","Mars","Jupiter","Venus","B"
Important: Return ONLY the CSV data, no additional text.
"""
        try:
            response = self.model.generate_content(prompt)
            mcq_data: List[List[str]] = []
            for raw_line in response.text.strip().split('\n'):
                line = raw_line.strip()
                # Skip blank lines and Markdown code fences around the CSV.
                if not line or line.startswith('```'):
                    continue
                # Use csv.reader so commas INSIDE quoted fields (which the
                # prompt's own example produces) do not split a row into
                # more than six parts; naive str.split(',') silently dropped
                # every such MCQ.
                for parts in csv.reader([line]):
                    if len(parts) == 6:
                        mcq_data.append([part.strip().strip('"') for part in parts])
            return mcq_data
        except Exception as e:
            raise Exception(f"Error generating MCQs: {str(e)}")

    def process_pdf(self, pdf_file, api_key: str, batch_size: int = 5) -> Tuple[pd.DataFrame, str]:
        """Run the full pipeline on uploaded PDF bytes.

        Args:
            pdf_file: raw bytes of the uploaded PDF.
            api_key: Google Gemini API key.
            batch_size: statements per generation request.

        Returns:
            ``(DataFrame, status)`` on success, ``(None, error_message)`` on failure.
        """
        config_status = self.configure_gemini(api_key)
        if not self.configured:
            return None, config_status
        pdf_path = None
        try:
            # Persist the uploaded bytes so pdf2image can read a real file.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file)
                pdf_path = tmp_file.name
            page_texts = self.extract_text_from_pdf(pdf_path)
            statements = self.split_into_statements(page_texts)
            if not statements:
                return None, "❌ No text could be extracted from the PDF. Please check if the PDF contains readable text."
            batches = self.batch_statements(statements, batch_size)
            all_mcqs: List[List[str]] = []
            successful_batches = 0
            for i, batch in enumerate(batches, 1):
                try:
                    mcqs = self.generate_mcqs(". ".join(batch))
                    all_mcqs.extend(mcqs)
                    successful_batches += 1
                except Exception as e:
                    # Best-effort: a failed batch is logged and skipped so the
                    # remaining batches can still produce output.
                    print(f"Batch {i} failed: {str(e)}")
            if not all_mcqs:
                return None, "❌ No MCQs could be generated. Please check your PDF content and try again."
            df = pd.DataFrame(all_mcqs, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
            status_msg = f"✅ Successfully processed {successful_batches} batches and generated {len(all_mcqs)} MCQs!"
            return df, status_msg
        except Exception as e:
            return None, f"❌ Error processing PDF: {str(e)}"
        finally:
            # Single cleanup path for the temp file, on success AND on error
            # (the original duplicated this with a bare `except: pass`).
            if pdf_path is not None:
                try:
                    os.unlink(pdf_path)
                except OSError:
                    pass
# Single shared generator instance used by the Gradio callback below.
generator = PDFToMCQGenerator()
def process_pdf_interface(pdf_file, api_key, batch_size=5):
    """Validate the UI inputs, then delegate to the shared generator.

    Returns a ``(dataframe_or_None, status_message)`` pair for the two
    Gradio output components.
    """
    # Guard clauses: reject missing inputs before doing any work.
    if pdf_file is None:
        return None, "❌ Please upload a PDF file."
    if not api_key.strip():
        return None, "❌ Please enter your Gemini API key."
    try:
        batch_size = int(batch_size)
    except ValueError:
        return None, "❌ Batch size must be a number."
    if not 1 <= batch_size <= 10:
        return None, "❌ Batch size must be between 1 and 10."
    df, status = generator.process_pdf(pdf_file, api_key, batch_size)
    # process_pdf already pairs a None frame with its error status.
    return (df, status) if df is not None else (None, status)
# Create Gradio interface.
# Fix: the two emoji literals below were mojibake in the original
# ("πŸ“š" -> 📚, "πŸ“₯" -> 📥 — UTF-8 bytes misdecoded by a copy step),
# matching the intact ❌ emojis used elsewhere in the file.
with gr.Blocks(title="PPSC PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PPSC PDF to MCQ Generator")
    gr.Markdown("Convert PDF content into multiple-choice questions using Google Gemini")
    with gr.Row():
        with gr.Column():
            # Input column: API key, PDF (delivered as raw bytes), batch size.
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="Enter your Google Gemini API key...",
                info="Get your API key from: https://aistudio.google.com/app/apikey"
            )
            pdf_file = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="binary"
            )
            batch_size = gr.Number(
                label="Batch Size",
                value=5,
                minimum=1,
                maximum=10,
                step=1,
                info="Number of statements to process together (1-10)"
            )
            process_btn = gr.Button("Generate MCQs", variant="primary")
        with gr.Column():
            # Output column: status text plus the generated MCQ table.
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3
            )
            mcq_output = gr.Dataframe(
                label="Generated MCQs",
                headers=["Question", "Option A", "Option B", "Option C", "Option D", "Correct Answer"],
                wrap=True,
            )
    # Wire the button to the validation/processing callback.
    process_btn.click(
        fn=process_pdf_interface,
        inputs=[pdf_file, api_key, batch_size],
        outputs=[mcq_output, status_output]
    )

    # Re-rendered whenever mcq_output changes: offer an Excel download
    # once the table actually has rows.
    @gr.render(inputs=mcq_output)
    def render_download_button(df):
        if df is not None and not df.empty:
            with gr.Row():
                # Write the table to a temp .xlsx for the DownloadButton.
                # NOTE(review): the temp file is never deleted — presumably
                # acceptable for a short-lived Space; confirm.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
                    df.to_excel(tmp_file.name, index=False)
                download_btn = gr.DownloadButton(
                    "📥 Download as Excel",
                    value=tmp_file.name,
                    file_name="generated_mcqs.xlsx"
                )
# For Hugging Face deployment
if __name__ == "__main__":
    demo.launch(
        # Bind to all interfaces when running inside a HF Space (SPACE_ID is
        # set there); otherwise pass None so Gradio uses its default host.
        server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
        share=False
    )