PDF-Processor / app.py
167AliRaza's picture
Update app.py
95fb4d2 verified
raw
history blame
9.8 kB
# Standard library
import csv
import io
import os
import tempfile
import time
from typing import List, Tuple

# Third-party
import google.generativeai as genai
import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
# Configure Gemini API
def configure_gemini_api(api_key: str):
    """Point the Gemini client at the user-supplied API key.

    Returns a confirmation string suitable for display in the UI.
    """
    genai.configure(api_key=api_key)
    confirmation = "βœ… API Key configured successfully!"
    return confirmation
def extract_text_from_pdf(pdf_file) -> str:
    """OCR a PDF upload and return the concatenated text of all pages.

    Args:
        pdf_file: Raw PDF bytes (as delivered by the Gradio File input —
            TODO confirm the component is configured to return bytes,
            not a file path).

    Returns:
        The extracted text, or a string starting with "Error" on failure
        (callers check this prefix rather than catching exceptions).
    """
    tmp_path = None
    try:
        # pdf2image needs a real file on disk, so persist the upload first.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(pdf_file)
            tmp_path = tmp_file.name

        # Rasterize each page and OCR it; collect per-page strings and join
        # once instead of building the result with repeated concatenation.
        pages = convert_from_path(tmp_path)
        page_texts = [pytesseract.image_to_string(page) for page in pages]
        return ("\n".join(page_texts) + "\n") if page_texts else ""
    except Exception as e:
        return f"Error extracting text: {str(e)}"
    finally:
        # Always remove the temp file — the original version only deleted it
        # on the success path, leaking one file per failed conversion.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
    """Split *text* into chunks of at most *chunk_size* words each.

    Whitespace is normalized by the split/rejoin; an empty or
    whitespace-only input yields an empty list.
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
    """Ask Gemini to generate MCQs for one text chunk and parse the reply.

    Tries a list of model names in order (newer/cheaper first) and stops at
    the first one that returns text. The model is prompted to answer in CSV;
    the reply is parsed into rows of
    [Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer].

    Returns:
        A list of 6-element rows; empty if every model failed or the
        response contained no parseable rows.
    """
    models_to_try = [
        'gemini-2.0-flash-exp',
        'gemini-1.5-flash',
        'gemini-1.5-pro'
    ]
    prompt = f"""
Generate 10 multiple choice questions from the following text.
Each question must have:
- A clear, specific question
- 4 options labeled A, B, C, D
- One correct answer (A, B, C, or D)
Format your response as CSV with headers: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Important formatting rules:
- Use commas only as field separators
- If any field contains a comma, wrap it in double quotes
- Each row should be on a new line
- Make questions specific and clear
- Ensure options are distinct and plausible
Text to analyze:
{chunk}
"""
    # Configure API for this call (key may differ between requests).
    genai.configure(api_key=api_key)

    response = None
    for model_name in models_to_try:
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            if response.text:
                break
        except Exception as e:
            # A model may be unavailable for this key/region/quota; log and
            # fall through to the next candidate.
            print(f"Error with {model_name}: {e}")
            continue

    if not (response and response.text):
        return []

    lines = response.text.strip().splitlines()
    # Skip the CSV header row the model was asked to emit, if present.
    if lines and 'Question' in lines[0]:
        lines = lines[1:]

    # Parse with the csv module instead of the previous hand-rolled
    # character scanner: it handles quoted fields correctly (embedded
    # commas, doubled quotes) and does not drop an empty trailing field.
    mcq_data: List[List[str]] = []
    for parts in csv.reader(lines):
        # Keep only complete rows that actually contain a question.
        if len(parts) >= 6 and parts[0].strip():
            mcq_data.append([part.strip() for part in parts[:6]])
    return mcq_data
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
    """Run the full pipeline: PDF -> OCR text -> Gemini MCQs -> Excel file.

    Args:
        pdf_file: Uploaded PDF from the Gradio File component.
        api_key: User-supplied Gemini API key.
        chunk_size: Words per chunk. Coerced to int before use because a
            Gradio slider may deliver a float, which would break range()
            inside chunk_text.
        progress: Gradio progress reporter (injected by the framework).

    Returns:
        (status_message, xlsx_path) — xlsx_path is "" on any failure.
    """
    if not api_key:
        return "❌ Please provide your Gemini API key", ""
    if not pdf_file:
        return "❌ Please upload a PDF file", ""
    try:
        progress(0.1, desc="Extracting text from PDF...")
        extracted_text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf reports failure via an "Error..." prefix.
        if extracted_text.startswith("Error"):
            return extracted_text, ""

        progress(0.2, desc="Chunking text...")
        chunks = chunk_text(extracted_text, int(chunk_size))
        if not chunks:
            return "❌ No text could be extracted from the PDF", ""

        # Generate MCQs chunk by chunk, updating progress as we go.
        all_mcq_data = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
            all_mcq_data.extend(generate_mcqs_from_chunk(chunk, api_key))
            time.sleep(1)  # crude spacing between API calls to avoid rate limits

        progress(0.95, desc="Creating Excel file...")
        if not all_mcq_data:
            return "❌ No MCQs could be generated from the PDF content", ""

        df = pd.DataFrame(
            all_mcq_data,
            columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer']
        )

        # Write the workbook directly to a named temp file for download;
        # the previous in-memory BytesIO round-trip was unnecessary.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
        temp_file.close()
        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='MCQs')

        progress(1.0, desc="Complete!")
        success_message = f"βœ… Successfully generated {len(all_mcq_data)} MCQs from {total_chunks} text chunks!"
        return success_message, temp_file.name
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", ""
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF-to-MCQ app."""
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
        # Intro / usage instructions shown at the top of the page.
        gr.Markdown(
            """
# πŸ“š PDF to MCQ Generator
Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
## How to use:
1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
2. Enter your API key below
3. Upload your PDF file
4. Adjust chunk size if needed (larger = fewer API calls, smaller = more focused questions)
5. Click "Generate MCQs" and wait for processing
6. Download the generated Excel file with your MCQs
"""
        )
        with gr.Row():
            # Left column: all user inputs and the trigger button.
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="πŸ”‘ Gemini API Key",
                    placeholder="Enter your Gemini API key here...",
                    type="password"  # mask the key in the UI
                )
                pdf_input = gr.File(
                    label="πŸ“„ Upload PDF File",
                    file_types=[".pdf"]
                )
                # Words per processing batch; passed to process_pdf_to_mcqs
                # as chunk_size.
                chunk_size_input = gr.Slider(
                    minimum=500,
                    maximum=3000,
                    value=1500,
                    step=100,
                    label="πŸ“ Chunk Size (words per processing batch)"
                )
                generate_btn = gr.Button(
                    "πŸš€ Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            # Right column: status text and the (initially hidden) download.
            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    placeholder="Status updates will appear here...",
                    lines=10
                )
                # Hidden until processing produces a file path; revealed by
                # the .then() handler below.
                download_file = gr.File(
                    label="⬇️ Download MCQs Excel File",
                    visible=False
                )
        # Event handlers: run the pipeline, then toggle the download
        # component's visibility based on whether a file path was returned.
        generate_btn.click(
            fn=process_pdf_to_mcqs,
            inputs=[pdf_input, api_key_input, chunk_size_input],
            outputs=[status_output, download_file],
            show_progress=True
        ).then(
            fn=lambda x: gr.update(visible=bool(x)),
            inputs=[download_file],
            outputs=[download_file]
        )
        # Feature summary and caveats shown below the controls.
        gr.Markdown(
            """
## πŸ“‹ Features:
- **OCR Text Extraction**: Converts PDF pages to images and extracts text
- **Smart Chunking**: Breaks large documents into manageable pieces
- **Multiple AI Models**: Automatically tries different Gemini models for best results
- **Excel Output**: Download MCQs in a formatted Excel file
- **Progress Tracking**: Real-time updates on processing status
## ⚠️ Notes:
- Processing time depends on PDF length and complexity
- Large PDFs are processed in chunks to avoid timeouts
- Make sure your PDF contains readable text (not just images)
- API key is not stored and only used for your session
"""
        )
    return demo
# Launch the app when executed as a script (not on import).
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on port 7860 — the standard setup for running
    # inside a container / Hugging Face Space.
    demo.launch(server_name="0.0.0.0", server_port=7860)