# PDF-Processor / app.py
import os
import pandas as pd
from pdf2image import convert_from_path
import pytesseract
import tempfile
import io
import gradio as gr
import google.generativeai as genai
from typing import List, Tuple
import time
import csv
# Configure Gemini API
def configure_gemini_api(api_key: str):
    """Register the user's key with the google.generativeai SDK.

    Returns a short confirmation string suitable for display in the UI.
    """
    confirmation = "βœ… API Key configured successfully!"
    genai.configure(api_key=api_key)
    return confirmation
def extract_text_from_pdf(pdf_file_path: str) -> str:
    """OCR every page of the PDF at *pdf_file_path* and return the combined text.

    On failure this returns a human-readable string beginning with "Error"
    instead of raising — callers check for that prefix.
    """
    try:
        # Render each PDF page to an image, then OCR each one.
        page_images = convert_from_path(pdf_file_path)
        page_texts = [pytesseract.image_to_string(img) for img in page_images]
        return "".join(text + "\n" for text in page_texts)
    except Exception as exc:
        return f"Error extracting text: {str(exc)}"
def chunk_text(text: str, chunk_size: int = 500) -> List[str]:  # Changed default to 500 for more chunks/MCQs
    """Split *text* into chunks of at most *chunk_size* whitespace-delimited words.

    Returns an empty list when *text* contains no words.
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def _first_successful_response(prompt: str) -> str:
    """Try each known Gemini model in order; return the first non-empty response text.

    Returns "" when every model fails. Accessing ``response.text`` can raise
    (e.g. when the response was blocked), so the access happens inside the
    per-model try block — a failing model never leaks a half-usable response
    object to the caller.
    """
    models_to_try = [
        'gemini-2.0-flash-exp',
        'gemini-1.5-flash',
        'gemini-1.5-pro'
    ]
    for model_name in models_to_try:
        try:
            print(f"Trying model: {model_name}")
            model = genai.GenerativeModel(model_name)
            text = model.generate_content(prompt).text
            if text:
                print(f"βœ… Successfully used model: {model_name}")
                return text
        except Exception as e:
            print(f"❌ Error with {model_name}: {e}")
    return ""


def _parse_mcq_line(line: str) -> List[str]:
    """Parse one candidate response line into [Question, A, B, C, D, Answer].

    Returns [] for header echoes, malformed lines, or lines whose correct
    answer is not one of A/B/C/D.
    """
    # Skip header lines the model sometimes emits despite the prompt rules.
    if ('Question' in line and 'OptionA' in line and 'OptionB' in line) or line.startswith('Question,'):
        print(f"❌ Skipped header line: {line[:50]}...")
        return []
    # A valid row needs at least 5 separating commas (6 fields).
    if not line or line.count(',') < 5:
        print(f"❌ Skipped invalid line (comma count: {line.count(',')}): {line[:50]}...")
        return []
    try:
        # Proper CSV parsing handles quoted fields containing commas.
        parts = next(csv.reader([line]))
        print(f"Parsed parts: {len(parts)} fields")
        cleaned_parts = [part.strip() for part in parts[:6]]
    except csv.Error as e:
        # Fallback: naive split (won't honor quoted commas, but salvages
        # lines the csv module rejects outright).
        print(f"❌ CSV parsing error: {e}")
        parts = line.split(',')
        cleaned_parts = [part.strip().strip('"') for part in parts[:6]]
    if len(parts) < 6 or not parts[0].strip() or parts[0].lower().startswith('question'):
        print(f"❌ Invalid parts count or empty question. Parts: {len(parts)}, First part: '{parts[0] if parts else 'N/A'}'")
        return []
    if cleaned_parts[5].upper() not in ('A', 'B', 'C', 'D'):
        print(f"❌ Invalid answer format: {cleaned_parts[5]}")
        return []
    return cleaned_parts


def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]:
    """Generate up to *mcqs_per_chunk* MCQs from a text chunk using the Gemini API.

    Returns a list of 6-field rows: [Question, OptionA, OptionB, OptionC,
    OptionD, CorrectAnswer]. Returns an empty list when every model fails or
    no line of the response parses as a valid MCQ.
    """
    print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
    print(f"Chunk length: {len(chunk)} characters")
    print(f"Chunk preview: {chunk[:200]}...")
    prompt = f"""
Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
Each question must have:
- A clear, specific question
- 4 options labeled A, B, C, D
- One correct answer (A, B, C, or D)
IMPORTANT: Do NOT include any headers or column names in your response.
Format each question as: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Rules:
- Start directly with the first question, no headers
- Use commas only as field separators
- If any field contains a comma, wrap it in double quotes
- Each question should be on a new line
- Make questions specific and clear based on the text content
- Ensure all 4 options are plausible but only one is correct
- The correct answer should be A, B, C, or D only
Text to analyze:
{chunk}
"""
    # Configure API
    genai.configure(api_key=api_key)
    mcq_data: List[List[str]] = []
    output = _first_successful_response(prompt).strip()
    if output:
        print(f"\n--- RAW AI RESPONSE FOR CHUNK {chunk_number} ---")
        print(output)
        print("--- END RAW RESPONSE ---\n")
        lines = [line.strip() for line in output.splitlines() if line.strip()]
        print(f"Total non-empty lines in response: {len(lines)}")
        for idx, line in enumerate(lines):
            print(f"Processing line {idx + 1}: {line[:100]}...")
            parsed = _parse_mcq_line(line)
            if parsed:
                mcq_data.append(parsed)
                print(f"βœ… Added MCQ: {parsed[0][:50]}... (Answer: {parsed[5]})")
    else:
        print(f"❌ No response received for chunk {chunk_number}")
    print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
    return mcq_data
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]:  # Added mcqs_per_chunk param, default 20
    """Main pipeline: OCR the PDF, chunk the text, generate MCQs, write Excel.

    Returns (status_message, path_to_xlsx). The path is None on any failure;
    the status message always starts with ❌ in that case.
    """
    if not api_key:
        return "❌ Please provide your Gemini API key", None
    if not pdf_file:
        return "❌ Please upload a PDF file", None
    try:
        # Extract text from PDF via OCR.
        progress(0.1, desc="Extracting text from PDF...")
        extracted_text = extract_text_from_pdf(pdf_file.name)
        # extract_text_from_pdf signals failure with an "Error..." string.
        if extracted_text.startswith("Error"):
            return extracted_text, None
        # Chunk the text so each API call stays within prompt limits.
        progress(0.2, desc="Chunking text...")
        chunks = chunk_text(extracted_text, chunk_size)
        if not chunks:
            return "❌ No text could be extracted from the PDF", None
        # Generate MCQs from each chunk.
        all_mcq_data = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
            chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i + 1, mcqs_per_chunk)
            all_mcq_data.extend(chunk_mcqs)
            # Throttle between API calls to respect rate limits; the wait is
            # skipped after the final chunk (no further call follows).
            if i < total_chunks - 1:
                time.sleep(0.5)
        progress(0.95, desc="Creating Excel file...")
        if not all_mcq_data:
            return "❌ No MCQs could be generated from the PDF content", None
        # Drop duplicate questions (case-insensitive), keeping first occurrence.
        seen_questions = set()
        unique_mcq_data = []
        for mcq in all_mcq_data:
            question_text = mcq[0].lower().strip()
            if question_text not in seen_questions:
                seen_questions.add(question_text)
                unique_mcq_data.append(mcq)
        df = pd.DataFrame(unique_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
        # delete=False: the file must survive this function so Gradio can
        # serve it for download. Closed immediately so pandas can reopen it.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
        temp_file.close()
        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='MCQs')
        progress(1.0, desc="Complete!")
        success_message = f"βœ… Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
        return success_message, temp_file.name
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", None
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF→MCQ generator."""
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# πŸ“š PDF to MCQ Generator
Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
## How to use:
1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
2. Enter your API key below
3. Upload your PDF file
4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
6. Click "Generate MCQs" and wait for processing
7. Download the generated Excel file with your MCQs
"""
        )
        with gr.Row():
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="πŸ”‘ Gemini API Key",
                    placeholder="Enter your Gemini API key here...",
                    type="password"
                )
                pdf_input = gr.File(
                    label="πŸ“„ Upload PDF File",
                    file_types=[".pdf"]
                )
                chunk_size_input = gr.Slider(
                    minimum=300,  # Lowered min to allow even smaller chunks
                    maximum=3000,
                    value=500,  # Changed default to 500 for more chunks
                    step=100,
                    label="πŸ“ Chunk Size (words per processing batch)"
                )
                mcqs_per_chunk_input = gr.Slider(
                    minimum=5,
                    maximum=50,  # Increased max for more MCQs per chunk
                    value=20,  # New slider for MCQs per chunk, default 20
                    step=5,
                    label="πŸ”’ MCQs per Chunk (higher = more MCQs, but may increase failures)"
                )
                generate_btn = gr.Button(
                    "πŸš€ Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    placeholder="Status updates will appear here...",
                    lines=10
                )
                # Hidden until a file is actually produced (see .then below).
                download_file = gr.File(
                    label="⬇️ Download MCQs Excel File",
                    visible=False
                )
        # Event handlers: run the pipeline, then reveal the download component
        # only when a file path was produced. (The previous conditional lambda
        # was redundant — visible=bool(file_path) covers both branches.)
        generate_btn.click(
            fn=process_pdf_to_mcqs,
            inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
            outputs=[status_output, download_file],
            show_progress=True
        ).then(
            fn=lambda file_path: gr.update(visible=bool(file_path)),
            inputs=[download_file],
            outputs=[download_file]
        )
        gr.Markdown(
            """
## πŸ“‹ Features:
- **OCR Text Extraction**: Converts PDF pages to images and extracts text
- **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
- **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
- **Multiple AI Models**: Automatically tries different Gemini models for best results
- **Excel Output**: Download MCQs in a formatted Excel file
- **Progress Tracking**: Real-time updates on processing status
## ⚠️ Notes:
- To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
- Processing time depends on PDF length and settings (more MCQs = longer time)
- Large PDFs are processed in chunks to avoid timeouts
- Make sure your PDF contains readable text (not just images)
- API key is not stored and only used for your session
- Reduced delay between API calls for faster processing, but monitor for rate limits
"""
        )
    return demo
# Launch the app
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)