PDF-Processor / app.py
167AliRaza's picture
Update app.py
95fb4d2 verified
raw
history blame
9.8 kB
# Standard library
import csv
import io
import os
import tempfile
import time
from typing import List, Tuple

# Third-party
import google.generativeai as genai
import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
# Configure Gemini API
def configure_gemini_api(api_key: str):
    """Point the Gemini client at the user-supplied API key.

    Returns a confirmation string suitable for display in the UI.
    """
    genai.configure(api_key=api_key)
    confirmation = "βœ… API Key configured successfully!"
    return confirmation
def extract_text_from_pdf(pdf_file) -> str:
    """OCR a PDF upload and return the concatenated text of all pages.

    Args:
        pdf_file: Raw PDF bytes (as delivered by the Gradio File input —
            TODO confirm the component is configured to return bytes,
            not a file path).

    Returns:
        The extracted text, or a string starting with "Error" on failure
        (callers check this prefix rather than catching exceptions).
    """
    tmp_path = None
    try:
        # pdf2image needs a real file on disk, so persist the upload first.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(pdf_file)
            tmp_path = tmp_file.name

        # Rasterize each page and OCR it; collect per-page strings and join
        # once instead of building the result with repeated concatenation.
        pages = convert_from_path(tmp_path)
        page_texts = [pytesseract.image_to_string(page) for page in pages]
        return ("\n".join(page_texts) + "\n") if page_texts else ""
    except Exception as e:
        return f"Error extracting text: {str(e)}"
    finally:
        # Always remove the temp file — the original version only deleted it
        # on the success path, leaking one file per failed conversion.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
    """Split *text* into chunks of at most *chunk_size* words each.

    Whitespace is normalized by the split/rejoin; an empty or
    whitespace-only input yields an empty list.
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
    """Ask Gemini to generate MCQs for one text chunk and parse the reply.

    Tries a list of model names in order (newer/cheaper first) and stops at
    the first one that returns text. The model is prompted to answer in CSV;
    the reply is parsed into rows of
    [Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer].

    Returns:
        A list of 6-element rows; empty if every model failed or the
        response contained no parseable rows.
    """
    models_to_try = [
        'gemini-2.0-flash-exp',
        'gemini-1.5-flash',
        'gemini-1.5-pro'
    ]
    prompt = f"""
Generate 10 multiple choice questions from the following text.
Each question must have:
- A clear, specific question
- 4 options labeled A, B, C, D
- One correct answer (A, B, C, or D)
Format your response as CSV with headers: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Important formatting rules:
- Use commas only as field separators
- If any field contains a comma, wrap it in double quotes
- Each row should be on a new line
- Make questions specific and clear
- Ensure options are distinct and plausible
Text to analyze:
{chunk}
"""
    # Configure API for this call (key may differ between requests).
    genai.configure(api_key=api_key)

    response = None
    for model_name in models_to_try:
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            if response.text:
                break
        except Exception as e:
            # A model may be unavailable for this key/region/quota; log and
            # fall through to the next candidate.
            print(f"Error with {model_name}: {e}")
            continue

    if not (response and response.text):
        return []

    lines = response.text.strip().splitlines()
    # Skip the CSV header row the model was asked to emit, if present.
    if lines and 'Question' in lines[0]:
        lines = lines[1:]

    # Parse with the csv module instead of the previous hand-rolled
    # character scanner: it handles quoted fields correctly (embedded
    # commas, doubled quotes) and does not drop an empty trailing field.
    mcq_data: List[List[str]] = []
    for parts in csv.reader(lines):
        # Keep only complete rows that actually contain a question.
        if len(parts) >= 6 and parts[0].strip():
            mcq_data.append([part.strip() for part in parts[:6]])
    return mcq_data
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
    """Run the full pipeline: PDF -> OCR text -> Gemini MCQs -> Excel file.

    Args:
        pdf_file: Uploaded PDF from the Gradio File component.
        api_key: User-supplied Gemini API key.
        chunk_size: Words per chunk. Coerced to int before use because a
            Gradio slider may deliver a float, which would break range()
            inside chunk_text.
        progress: Gradio progress reporter (injected by the framework).

    Returns:
        (status_message, xlsx_path) — xlsx_path is "" on any failure.
    """
    if not api_key:
        return "❌ Please provide your Gemini API key", ""
    if not pdf_file:
        return "❌ Please upload a PDF file", ""
    try:
        progress(0.1, desc="Extracting text from PDF...")
        extracted_text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf reports failure via an "Error..." prefix.
        if extracted_text.startswith("Error"):
            return extracted_text, ""

        progress(0.2, desc="Chunking text...")
        chunks = chunk_text(extracted_text, int(chunk_size))
        if not chunks:
            return "❌ No text could be extracted from the PDF", ""

        # Generate MCQs chunk by chunk, updating progress as we go.
        all_mcq_data = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
            all_mcq_data.extend(generate_mcqs_from_chunk(chunk, api_key))
            time.sleep(1)  # crude spacing between API calls to avoid rate limits

        progress(0.95, desc="Creating Excel file...")
        if not all_mcq_data:
            return "❌ No MCQs could be generated from the PDF content", ""

        df = pd.DataFrame(
            all_mcq_data,
            columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer']
        )

        # Write the workbook directly to a named temp file for download;
        # the previous in-memory BytesIO round-trip was unnecessary.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
        temp_file.close()
        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='MCQs')

        progress(1.0, desc="Complete!")
        success_message = f"βœ… Successfully generated {len(all_mcq_data)} MCQs from {total_chunks} text chunks!"
        return success_message, temp_file.name
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", ""
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF-to-MCQ app."""
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
        # Intro / usage instructions shown at the top of the page.
        gr.Markdown(
            """
# πŸ“š PDF to MCQ Generator
Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
## How to use:
1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
2. Enter your API key below
3. Upload your PDF file
4. Adjust chunk size if needed (larger = fewer API calls, smaller = more focused questions)
5. Click "Generate MCQs" and wait for processing
6. Download the generated Excel file with your MCQs
"""
        )
        with gr.Row():
            # Left column: all user inputs and the trigger button.
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="πŸ”‘ Gemini API Key",
                    placeholder="Enter your Gemini API key here...",
                    type="password"  # mask the key in the UI
                )
                pdf_input = gr.File(
                    label="πŸ“„ Upload PDF File",
                    file_types=[".pdf"]
                )
                # Words per processing batch; passed to process_pdf_to_mcqs
                # as chunk_size.
                chunk_size_input = gr.Slider(
                    minimum=500,
                    maximum=3000,
                    value=1500,
                    step=100,
                    label="πŸ“ Chunk Size (words per processing batch)"
                )
                generate_btn = gr.Button(
                    "πŸš€ Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            # Right column: status text and the (initially hidden) download.
            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    placeholder="Status updates will appear here...",
                    lines=10
                )
                # Hidden until processing produces a file path; revealed by
                # the .then() handler below.
                download_file = gr.File(
                    label="⬇️ Download MCQs Excel File",
                    visible=False
                )
        # Event handlers: run the pipeline, then toggle the download
        # component's visibility based on whether a file path was returned.
        generate_btn.click(
            fn=process_pdf_to_mcqs,
            inputs=[pdf_input, api_key_input, chunk_size_input],
            outputs=[status_output, download_file],
            show_progress=True
        ).then(
            fn=lambda x: gr.update(visible=bool(x)),
            inputs=[download_file],
            outputs=[download_file]
        )
        # Feature summary and caveats shown below the controls.
        gr.Markdown(
            """
## πŸ“‹ Features:
- **OCR Text Extraction**: Converts PDF pages to images and extracts text
- **Smart Chunking**: Breaks large documents into manageable pieces
- **Multiple AI Models**: Automatically tries different Gemini models for best results
- **Excel Output**: Download MCQs in a formatted Excel file
- **Progress Tracking**: Real-time updates on processing status
## ⚠️ Notes:
- Processing time depends on PDF length and complexity
- Large PDFs are processed in chunks to avoid timeouts
- Make sure your PDF contains readable text (not just images)
- API key is not stored and only used for your session
"""
        )
    return demo
# Launch the app when executed as a script (not on import).
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on port 7860 — the standard setup for running
    # inside a container / Hugging Face Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)