File size: 13,685 Bytes
95fb4d2
c0e8eab
 
 
bce1ee4
95fb4d2
 
 
 
 
238717d
95fb4d2
 
 
 
 
 
c0e8eab
65956db
95fb4d2
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
 
c0e8eab
54b1661
95fb4d2
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
c0e8eab
54b1661
95fb4d2
41c3d7f
 
 
 
95fb4d2
 
 
 
 
 
 
54b1661
95fb4d2
 
 
 
 
238717d
 
95fb4d2
238717d
 
95fb4d2
 
238717d
 
 
 
95fb4d2
 
 
 
 
 
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
 
 
41c3d7f
95fb4d2
 
 
 
41c3d7f
95fb4d2
 
41c3d7f
95fb4d2
 
 
 
41c3d7f
 
 
 
238717d
41c3d7f
238717d
41c3d7f
 
 
238717d
 
41c3d7f
238717d
 
 
 
41c3d7f
238717d
 
 
 
 
 
 
41c3d7f
95fb4d2
238717d
 
 
 
 
 
 
41c3d7f
 
 
 
 
238717d
41c3d7f
 
238717d
 
 
 
 
 
41c3d7f
 
 
 
 
95fb4d2
41c3d7f
95fb4d2
c0e8eab
54b1661
95fb4d2
c0e8eab
65956db
95fb4d2
 
65956db
95fb4d2
c0e8eab
95fb4d2
 
65956db
95fb4d2
 
65956db
95fb4d2
 
 
 
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
54b1661
95fb4d2
 
54b1661
 
95fb4d2
 
 
 
65956db
95fb4d2
238717d
 
 
 
 
 
 
 
 
95fb4d2
238717d
95fb4d2
65956db
 
 
95fb4d2
65956db
 
 
95fb4d2
 
 
54b1661
95fb4d2
 
 
c0e8eab
65956db
c0e8eab
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
 
 
 
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
95fb4d2
54b1661
95fb4d2
 
 
 
54b1661
 
 
 
 
 
 
 
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
95fb4d2
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
54b1661
 
95fb4d2
 
 
 
 
54b1661
 
95fb4d2
 
 
54b1661
95fb4d2
 
 
 
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import os
import pandas as pd
from pdf2image import convert_from_path
import pytesseract
import tempfile
import io
import gradio as gr
import google.generativeai as genai
from typing import List, Tuple
import time
import csv

# Configure Gemini API
def configure_gemini_api(api_key: str):
    """Register *api_key* with the google-generativeai client.

    Args:
        api_key: Gemini API key obtained from Google AI Studio.

    Returns:
        A human-readable confirmation string for display in the UI.
    """
    genai.configure(api_key=api_key)
    confirmation = "βœ… API Key configured successfully!"
    return confirmation

def extract_text_from_pdf(pdf_file_path: str) -> str:
    """Extract text from a PDF using OCR (one rendered image per page).

    Args:
        pdf_file_path: Path to the PDF file on disk.

    Returns:
        The concatenated page texts, each followed by a newline.  On failure
        returns a string beginning with "Error" — callers detect failure by
        checking that prefix rather than catching exceptions.
    """
    try:
        # Render every page to an image, then OCR each one.  Joining once
        # avoids the quadratic cost of repeated str += inside the loop.
        pages = convert_from_path(pdf_file_path)
        return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)
    except Exception as e:
        # Broad catch is intentional: pdf2image/poppler/tesseract can all
        # raise, and the UI surfaces the message as a status string.
        return f"Error extracting text: {str(e)}"

def chunk_text(text: str, chunk_size: int = 500) -> List[str]:  # Changed default to 500 for more chunks/MCQs
    """Split *text* into whitespace-delimited chunks of at most *chunk_size* words.

    The final chunk may be shorter; an empty/whitespace-only input yields [].
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]:  # Added mcqs_per_chunk param, default 20
    """Generate MCQs from a text chunk using Gemini API.

    Args:
        chunk: Plain-text passage the questions should be based on.
        api_key: Gemini API key; re-applied via genai.configure on each call.
        chunk_number: 1-based chunk index, used only for console logging.
        mcqs_per_chunk: Number of questions requested from the model.

    Returns:
        A list of 6-element rows:
        [question, option_a, option_b, option_c, option_d, correct_letter].
        Empty list when no model responded or no response line parsed.
    """
    # Verbose console logging is deliberate: the model's output is free-form
    # text, so these prints are the main tool for diagnosing parse failures.
    print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
    print(f"Chunk length: {len(chunk)} characters")
    print(f"Chunk preview: {chunk[:200]}...")
    
    # Tried in order; the first model that returns non-empty text wins.
    models_to_try = [
        'gemini-2.0-flash-exp',
        'gemini-1.5-flash',
        'gemini-1.5-pro'
    ]
    
    prompt = f"""
    Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
    Each question must have:
    - A clear, specific question
    - 4 options labeled A, B, C, D
    - One correct answer (A, B, C, or D)
    
    IMPORTANT: Do NOT include any headers or column names in your response.
    Format each question as: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
    
    Rules:
    - Start directly with the first question, no headers
    - Use commas only as field separators
    - If any field contains a comma, wrap it in double quotes
    - Each question should be on a new line
    - Make questions specific and clear based on the text content
    - Ensure all 4 options are plausible but only one is correct
    - The correct answer should be A, B, C, or D only
    
    Text to analyze:
    {chunk}
    """
    
    # Configure API
    genai.configure(api_key=api_key)
    
    mcq_data = []
    response = None
    
    # Fallback chain: any exception (quota, unknown model, network, ...)
    # simply moves on to the next model name.
    for model_name in models_to_try:
        try:
            print(f"Trying model: {model_name}")
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            
            if response.text:
                print(f"βœ… Successfully used model: {model_name}")
                break
        except Exception as e:
            print(f"❌ Error with {model_name}: {e}")
            continue
    
    if response and response.text:
        output = response.text.strip()
        print(f"\n--- RAW AI RESPONSE FOR CHUNK {chunk_number} ---")
        print(output)
        print("--- END RAW RESPONSE ---\n")
        
        lines = [line.strip() for line in output.splitlines() if line.strip()]
        print(f"Total non-empty lines in response: {len(lines)}")
        
        for idx, line in enumerate(lines):
            print(f"Processing line {idx + 1}: {line[:100]}...")
            
            # Skip any header lines that might still appear
            # despite the prompt's "no headers" instruction.
            if ('Question' in line and 'OptionA' in line and 'OptionB' in line) or line.startswith('Question,'):
                print(f"❌ Skipped header line: {line[:50]}...")
                continue
            
            # Skip empty lines or lines that don't look like MCQs
            # (a well-formed row needs at least 5 separating commas).
            if not line or line.count(',') < 5:
                print(f"❌ Skipped invalid line (comma count: {line.count(',')}): {line[:50]}...")
                continue
            
            # Parse CSV line using proper CSV parsing
            # so quoted fields containing commas stay intact.
            try:
                # Use StringIO to parse the line as CSV
                csv_reader = csv.reader([line])
                parts = next(csv_reader)
                print(f"Parsed parts: {len(parts)} fields")
                
                # Ensure we have exactly 6 parts and the question is not empty
                # (extra trailing fields beyond 6 are silently dropped).
                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
                    # Clean up each part
                    cleaned_parts = [part.strip() for part in parts[:6]]
                    # Validate that correct answer is A, B, C, or D
                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
                        mcq_data.append(cleaned_parts)
                        print(f"βœ… Added MCQ: {cleaned_parts[0][:50]}... (Answer: {cleaned_parts[5]})")
                    else:
                        print(f"❌ Invalid answer format: {cleaned_parts[5]}")
                else:
                    print(f"❌ Invalid parts count or empty question. Parts: {len(parts)}, First part: '{parts[0] if parts else 'N/A'}'")
                        
            except csv.Error as e:
                print(f"❌ CSV parsing error: {e}")
                # Fallback to simple split if CSV parsing fails
                # (naive split: quoted commas will inflate the field count here).
                parts = line.split(',')
                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
                    cleaned_parts = [part.strip().strip('"') for part in parts[:6]]
                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
                        mcq_data.append(cleaned_parts)
                        print(f"βœ… Added MCQ (fallback): {cleaned_parts[0][:50]}...")
                    else:
                        print(f"❌ Invalid answer format (fallback): {cleaned_parts[5]}")
    else:
        print(f"❌ No response received for chunk {chunk_number}")
    
    print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
    return mcq_data

def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]:  # Added mcqs_per_chunk param, default 20
    """Main function to process PDF and generate MCQs.

    Args:
        pdf_file: Gradio file object; only its .name (temp file path) is used.
        api_key: Gemini API key forwarded to each generation call.
        chunk_size: Words per chunk handed to the model.
        mcqs_per_chunk: Questions requested per chunk.
        progress: Gradio progress tracker (the gr.Progress() default is the
            documented Gradio idiom — Gradio injects a live tracker at call time).

    Returns:
        (status_message, xlsx_path) on success.  On any failure the second
        element is None — NOTE(review): the Tuple[str, str] annotation is
        inaccurate for the error paths.
    """
    if not api_key:
        return "❌ Please provide your Gemini API key", None
    
    if not pdf_file:
        return "❌ Please upload a PDF file", None
    
    try:
        # Extract text from PDF
        progress(0.1, desc="Extracting text from PDF...")
        extracted_text = extract_text_from_pdf(pdf_file.name)
        
        # extract_text_from_pdf signals failure via an "Error" string prefix.
        if extracted_text.startswith("Error"):
            return extracted_text, None
        
        # Chunk the text
        progress(0.2, desc="Chunking text...")
        chunks = chunk_text(extracted_text, chunk_size)
        
        if not chunks:
            return "❌ No text could be extracted from the PDF", None
        
        # Generate MCQs from each chunk
        all_mcq_data = []
        total_chunks = len(chunks)
        
        # Extraction took 0.0-0.2 of the bar; chunk processing fills 0.2-0.9.
        for i, chunk in enumerate(chunks):
            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
            
            chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i+1, mcqs_per_chunk)
            all_mcq_data.extend(chunk_mcqs)
            
            # Reduced delay to 0.5s for faster processing (to maximize MCQs, but monitor rate limits)
            time.sleep(0.5)
        
        progress(0.95, desc="Creating Excel file...")
        
        if not all_mcq_data:
            return "❌ No MCQs could be generated from the PDF content", None
        
        # Remove any duplicate questions
        # (case-insensitive on the question text; first occurrence wins).
        seen_questions = set()
        unique_mcq_data = []
        for mcq in all_mcq_data:
            question_text = mcq[0].lower().strip()
            if question_text not in seen_questions:
                seen_questions.add(question_text)
                unique_mcq_data.append(mcq)
        
        # Create DataFrame
        df = pd.DataFrame(unique_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
        
        # Create temporary Excel file for download
        # (delete=False so the file survives for Gradio to serve it).
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
        temp_file.close()  # Close to allow pandas to write to it
        
        # Write Excel file
        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='MCQs')
        
        progress(1.0, desc="Complete!")
        
        success_message = f"βœ… Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
        
        return success_message, temp_file.name
        
    except Exception as e:
        # Catch-all so the UI always gets a status string instead of a traceback.
        return f"❌ Error processing PDF: {str(e)}", None

# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the MCQ generator.

    Layout: a two-column row (inputs on the left, status/download on the
    right) between two Markdown help sections, plus the click wiring that
    runs process_pdf_to_mcqs and then reveals the download component.
    """
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # πŸ“š PDF to MCQ Generator
            
            Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
            
            ## How to use:
            1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
            2. Enter your API key below
            3. Upload your PDF file
            4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
            5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
            6. Click "Generate MCQs" and wait for processing
            7. Download the generated Excel file with your MCQs
            """
        )
        
        with gr.Row():
            # Left column (wider): all user inputs.
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="πŸ”‘ Gemini API Key",
                    placeholder="Enter your Gemini API key here...",
                    type="password"
                )
                
                pdf_input = gr.File(
                    label="πŸ“„ Upload PDF File",
                    file_types=[".pdf"]
                )
                
                chunk_size_input = gr.Slider(
                    minimum=300,  # Lowered min to allow even smaller chunks
                    maximum=3000,
                    value=500,  # Changed default to 500 for more chunks
                    step=100,
                    label="πŸ“ Chunk Size (words per processing batch)"
                )
                
                mcqs_per_chunk_input = gr.Slider(
                    minimum=5,
                    maximum=50,  # Increased max for more MCQs per chunk
                    value=20,  # New slider for MCQs per chunk, default 20
                    step=5,
                    label="πŸ”’ MCQs per Chunk (higher = more MCQs, but may increase failures)"
                )
                
                generate_btn = gr.Button(
                    "πŸš€ Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            
            # Right column: status text plus the (initially hidden) download.
            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    placeholder="Status updates will appear here...",
                    lines=10
                )
                
                download_file = gr.File(
                    label="⬇️ Download MCQs Excel File",
                    visible=False
                )
        
        # Event handlers
        # First step populates status + file value; the chained step then
        # toggles the download component's visibility based on that value.
        generate_btn.click(
            fn=process_pdf_to_mcqs,
            inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
            outputs=[status_output, download_file],
            show_progress=True
        ).then(
            fn=lambda file_path: gr.update(visible=bool(file_path)) if file_path else gr.update(visible=False),
            inputs=[download_file],
            outputs=[download_file]
        )
        
        gr.Markdown(
            """
            ## πŸ“‹ Features:
            - **OCR Text Extraction**: Converts PDF pages to images and extracts text
            - **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
            - **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
            - **Multiple AI Models**: Automatically tries different Gemini models for best results
            - **Excel Output**: Download MCQs in a formatted Excel file
            - **Progress Tracking**: Real-time updates on processing status
            
            ## ⚠️ Notes:
            - To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
            - Processing time depends on PDF length and settings (more MCQs = longer time)
            - Large PDFs are processed in chunks to avoid timeouts
            - Make sure your PDF contains readable text (not just images)
            - API key is not stored and only used for your session
            - Reduced delay between API calls for faster processing, but monitor for rate limits
            """
        )
    
    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    # 0.0.0.0 binds all interfaces so the app is reachable from outside a
    # container (e.g. Docker / Hugging Face Spaces) on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)