# pdf-to-mcqs / app.py
# Hugging Face Space by Ali-Raza-167 (commit f7c8822, verified)
import csv
import os
import re
import tempfile
from typing import List, Tuple

import google.generativeai as genai
import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
class PDFToMCQGenerator:
    """OCR a PDF, split its text into statements, and generate MCQs via Gemini.

    Pipeline (orchestrated by :meth:`process_pdf`):
    ``configure_gemini`` -> ``extract_text_from_pdf`` ->
    ``split_into_statements`` -> ``batch_statements`` -> ``generate_mcqs``.
    """

    def __init__(self):
        # The Gemini model is created lazily, once an API key is supplied.
        self.model = None
        self.configured = False

    def configure_gemini(self, api_key: str) -> str:
        """Configure the Gemini client with *api_key*; return a status message."""
        try:
            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel('gemini-pro')
            self.configured = True
            return "✅ Gemini configured successfully!"
        except Exception as e:
            return f"❌ Error configuring Gemini: {str(e)}"

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """OCR every page of the PDF at *pdf_path*; return one string per page.

        Raises:
            Exception: if page rendering or OCR fails (wraps the cause).
        """
        try:
            pages = convert_from_path(pdf_path)
            return [pytesseract.image_to_string(page) for page in pages]
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def split_into_statements(self, page_texts: List[str]) -> List[str]:
        """Split page texts into sentence-like statements on ``.``, ``!``, ``?``."""
        all_statements: List[str] = []
        for page_text in page_texts:
            # Drop empty fragments produced by consecutive terminators/whitespace.
            all_statements.extend(
                s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()
            )
        return all_statements

    def batch_statements(self, statements: List[str], batch_size: int = 5) -> List[List[str]]:
        """Group *statements* into consecutive chunks of at most *batch_size*."""
        return [statements[i:i + batch_size] for i in range(0, len(statements), batch_size)]

    def generate_mcqs(self, text_block: str) -> List[List[str]]:
        """Ask Gemini for 5 MCQs over *text_block*; return rows of 6 fields.

        Each row is ``[Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer]``.
        Response lines that do not parse into exactly six CSV fields are skipped.

        Raises:
            Exception: if Gemini is not configured or the API call fails.
        """
        if not self.configured:
            raise Exception("Gemini not configured. Please provide API key first.")
        prompt = f"""
Generate exactly 5 MCQs from the following statements.
Each question must have:
- Clear, concise Question
- 4 Options (A-D) with only one correct answer
- Correct Answer (ONLY the letter A, B, C, or D — no text)
Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Text:
{text_block}
Example format:
"What is the capital of France?","Paris","London","Berlin","Madrid","A"
"Which planet is known as the Red Planet?","Earth","Mars","Jupiter","Venus","B"
Important: Return ONLY the CSV data, no additional text.
"""
        try:
            response = self.model.generate_content(prompt)
            mcq_data: List[List[str]] = []
            for raw_line in response.text.strip().split('\n'):
                line = raw_line.strip()
                # Skip blank lines and Markdown code fences around the CSV.
                if not line or line.startswith('```'):
                    continue
                # Use csv.reader so commas INSIDE quoted fields (which the
                # prompt's own example produces) do not split a row into
                # more than six parts; naive str.split(',') silently dropped
                # every such MCQ.
                for parts in csv.reader([line]):
                    if len(parts) == 6:
                        mcq_data.append([part.strip().strip('"') for part in parts])
            return mcq_data
        except Exception as e:
            raise Exception(f"Error generating MCQs: {str(e)}")

    def process_pdf(self, pdf_file, api_key: str, batch_size: int = 5) -> Tuple[pd.DataFrame, str]:
        """Run the full pipeline on uploaded PDF bytes.

        Args:
            pdf_file: raw bytes of the uploaded PDF.
            api_key: Google Gemini API key.
            batch_size: statements per generation request.

        Returns:
            ``(DataFrame, status)`` on success, ``(None, error_message)`` on failure.
        """
        config_status = self.configure_gemini(api_key)
        if not self.configured:
            return None, config_status
        pdf_path = None
        try:
            # Persist the uploaded bytes so pdf2image can read a real file.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file)
                pdf_path = tmp_file.name
            page_texts = self.extract_text_from_pdf(pdf_path)
            statements = self.split_into_statements(page_texts)
            if not statements:
                return None, "❌ No text could be extracted from the PDF. Please check if the PDF contains readable text."
            batches = self.batch_statements(statements, batch_size)
            all_mcqs: List[List[str]] = []
            successful_batches = 0
            for i, batch in enumerate(batches, 1):
                try:
                    mcqs = self.generate_mcqs(". ".join(batch))
                    all_mcqs.extend(mcqs)
                    successful_batches += 1
                except Exception as e:
                    # Best-effort: a failed batch is logged and skipped so the
                    # remaining batches can still produce output.
                    print(f"Batch {i} failed: {str(e)}")
            if not all_mcqs:
                return None, "❌ No MCQs could be generated. Please check your PDF content and try again."
            df = pd.DataFrame(all_mcqs, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
            status_msg = f"✅ Successfully processed {successful_batches} batches and generated {len(all_mcqs)} MCQs!"
            return df, status_msg
        except Exception as e:
            return None, f"❌ Error processing PDF: {str(e)}"
        finally:
            # Single cleanup path for the temp file, on success AND on error
            # (the original duplicated this with a bare `except: pass`).
            if pdf_path is not None:
                try:
                    os.unlink(pdf_path)
                except OSError:
                    pass
# Single shared generator instance used by the Gradio callback below.
generator = PDFToMCQGenerator()
def process_pdf_interface(pdf_file, api_key, batch_size=5):
    """Validate the UI inputs, then delegate to the shared generator.

    Returns a ``(dataframe_or_None, status_message)`` pair for the two
    Gradio output components.
    """
    # Guard clauses: reject missing inputs before doing any work.
    if pdf_file is None:
        return None, "❌ Please upload a PDF file."
    if not api_key.strip():
        return None, "❌ Please enter your Gemini API key."
    try:
        batch_size = int(batch_size)
    except ValueError:
        return None, "❌ Batch size must be a number."
    if not 1 <= batch_size <= 10:
        return None, "❌ Batch size must be between 1 and 10."
    df, status = generator.process_pdf(pdf_file, api_key, batch_size)
    # process_pdf already pairs a None frame with its error status.
    return (df, status) if df is not None else (None, status)
# Create Gradio interface.
# Fix: the two emoji literals below were mojibake in the original
# ("πŸ“š" -> 📚, "πŸ“₯" -> 📥 — UTF-8 bytes misdecoded by a copy step),
# matching the intact ❌ emojis used elsewhere in the file.
with gr.Blocks(title="PPSC PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PPSC PDF to MCQ Generator")
    gr.Markdown("Convert PDF content into multiple-choice questions using Google Gemini")
    with gr.Row():
        with gr.Column():
            # Input column: API key, PDF (delivered as raw bytes), batch size.
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="Enter your Google Gemini API key...",
                info="Get your API key from: https://aistudio.google.com/app/apikey"
            )
            pdf_file = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="binary"
            )
            batch_size = gr.Number(
                label="Batch Size",
                value=5,
                minimum=1,
                maximum=10,
                step=1,
                info="Number of statements to process together (1-10)"
            )
            process_btn = gr.Button("Generate MCQs", variant="primary")
        with gr.Column():
            # Output column: status text plus the generated MCQ table.
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3
            )
            mcq_output = gr.Dataframe(
                label="Generated MCQs",
                headers=["Question", "Option A", "Option B", "Option C", "Option D", "Correct Answer"],
                wrap=True,
            )
    # Wire the button to the validation/processing callback.
    process_btn.click(
        fn=process_pdf_interface,
        inputs=[pdf_file, api_key, batch_size],
        outputs=[mcq_output, status_output]
    )

    # Re-rendered whenever mcq_output changes: offer an Excel download
    # once the table actually has rows.
    @gr.render(inputs=mcq_output)
    def render_download_button(df):
        if df is not None and not df.empty:
            with gr.Row():
                # Write the table to a temp .xlsx for the DownloadButton.
                # NOTE(review): the temp file is never deleted — presumably
                # acceptable for a short-lived Space; confirm.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
                    df.to_excel(tmp_file.name, index=False)
                download_btn = gr.DownloadButton(
                    "📥 Download as Excel",
                    value=tmp_file.name,
                    file_name="generated_mcqs.xlsx"
                )
# For Hugging Face deployment
if __name__ == "__main__":
    demo.launch(
        # Bind to all interfaces when running inside a HF Space (SPACE_ID is
        # set there); otherwise pass None so Gradio uses its default host.
        server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
        share=False
    )