PDF-Processor

Sleeping

App Files Files Community

167AliRaza committed on Sep 18, 2025

Commit

95fb4d2

verified ·

1 Parent(s): 478c204

Update app.py

Browse files

Files changed (1) hide show

app.py +263 -110

app.py CHANGED Viewed

@@ -1,128 +1,281 @@
-import gradio as gr
 import pandas as pd
 from pdf2image import convert_from_path
 import pytesseract
-import google.generativeai as genai
 import tempfile
-import csv
-from io import StringIO
-# Function: Extract text from PDF using OCR
-def extract_text_from_pdf(pdf_file):
-    """OCR every page of the PDF at path ``pdf_file`` and return the text.
-
-    Each page's text is followed by a newline; the combined result is
-    stripped of leading/trailing whitespace before being returned.
-    """
-    pages = convert_from_path(pdf_file)
-    all_text = ""
-    for page in pages:
-        text = pytesseract.image_to_string(page)
-        all_text += text + "\n"
-    return all_text.strip()
-# Function: Chunk text
-def chunk_text(text, chunk_size=1500):
-    """Yield ``text`` in space-joined pieces of at most ``chunk_size`` words."""
     words = text.split()
     for i in range(0, len(words), chunk_size):
-        yield " ".join(words[i:i+chunk_size])
-# Models to try (fallbacks): attempted in list order by generate_mcqs, which
-# stops at the first model whose response has text.
-models_to_try = [
-    "gemini-2.5-flash-lite",
-    "gemini-2.5-flash",
-    "gemini-2.5-pro",
-    "gemini-2.0-flash-lite",
-    "gemini-2.0-flash",
-    "gemini-1.5-flash",
-    "gemini-1.5-pro",
-]
-# Function: Generate MCQs
-def generate_mcqs(text, api_key):
-    """Generate MCQs for ``text`` chunk by chunk and collect them in a DataFrame.
-
-    Returns ``(None, None)`` when no valid row was parsed; otherwise the
-    DataFrame of all rows and a plain-text rendering of its first 10 rows.
-    """
     genai.configure(api_key=api_key)
-    chunks = list(chunk_text(text, 1500))
     mcq_data = []
-    for i, chunk in enumerate(chunks, start=1):
-        prompt = f"""
-Generate 10 MCQs from the following text.
-Return ONLY valid CSV rows with exactly 6 columns:
-Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
-Rules:
-- Do NOT add numbering, quotes, or explanations.
-- Do NOT add headers.
-- Do NOT add extra commas inside cells.
-- Exactly 10 rows per chunk.
-Text:\n{chunk}
-"""
-        response = None
-        # Fall through the model list until one yields text; any exception
-        # from a model simply moves on to the next one.
-        for model_name in models_to_try:
-            try:
-                model = genai.GenerativeModel(model_name)
-                response = model.generate_content(prompt)
-                if response.text:
-                    break
-            except Exception:
-                continue
-        if response and response.text:
-            output = response.text.strip()
-            try:
-                reader = csv.reader(StringIO(output))
-                for row in reader:
-                    # Accept only rows with a non-empty question and at
-                    # least six cells; extra cells are discarded.
-                    if len(row) >= 6 and row[0]:
-                        mcq_data.append(row[:6])  # keep only first 6 cols
-            except Exception:
-                continue
-    if not mcq_data:
-        return None, None
-    df = pd.DataFrame(
-        mcq_data,
-        columns=["Question", "OptionA", "OptionB", "OptionC", "OptionD", "CorrectAnswer"],
-    )
-    return df, df.head(10).to_string(index=False)
-# Gradio pipeline
-def process_pdf(pdf_file, api_key):
-    """Run OCR + MCQ generation for an uploaded PDF.
-
-    Returns ``(message_or_preview, xlsx_path_or_None)``: on success the
-    preview text and the path of a temporary ``.xlsx`` file, otherwise an
-    error message and ``None``.
-    """
     if not api_key:
-        return "❌ Please enter your Gemini API key.", None
     try:
-        # The upload object is expected to expose the file path as ``.name``.
-        text = extract_text_from_pdf(pdf_file.name)
-        df, preview = generate_mcqs(text, api_key)
-        if df is None:
-            return "❌ No valid MCQs generated.", None
-        # Save to a temporary Excel file
-        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
-        df.to_excel(tmp_file.name, index=False)
-        return preview, tmp_file.name
     except Exception as e:
-        return f"Error: {str(e)}", None
-# Gradio UI
-# Components are created in display order inside the Blocks context; the
-# button wires process_pdf to the two inputs and two outputs declared here.
-with gr.Blocks() as demo:
-    gr.Markdown("## 📘 PDF to MCQ Generator (Gemini AI)")
-    gr.Markdown(
-        "Upload a PDF, enter your Gemini API key, extract text with OCR, and generate MCQs saved as Excel."
-    )
-    api_key = gr.Textbox(label="Enter your Gemini API Key", type="password")
-    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-    generate_btn = gr.Button("Generate MCQs")
-    preview_output = gr.Textbox(label="Preview (First 10 MCQs)", lines=15)
-    excel_output = gr.File(label="Download Excel (.xlsx)")
-    # process_pdf returns (preview text, xlsx path), matching the outputs order.
-    generate_btn.click(
-        fn=process_pdf,
-        inputs=[pdf_input, api_key],
-        outputs=[preview_output, excel_output],
-    )
-# Run app
 if __name__ == "__main__":
-    # Start the Gradio server with its default launch settings.
-    demo.launch()

+import io
+import os
 import tempfile
+import time
+from typing import List, Optional, Tuple
+
+import gradio as gr
+import google.generativeai as genai
 import pandas as pd
 from pdf2image import convert_from_path
 import pytesseract
+# Configure Gemini API
+def configure_gemini_api(api_key: str):
+    """Register ``api_key`` with the Gemini client and return a status message.
+
+    The key is handed to ``genai.configure`` as-is; this function performs
+    no validation of its own.
+    """
+    confirmation = "✅ API Key configured successfully!"
+    genai.configure(api_key=api_key)
+    return confirmation
+def extract_text_from_pdf(pdf_file) -> str:
+    """Extract text from a PDF by converting each page to an image and running OCR.
+
+    Args:
+        pdf_file: Raw PDF bytes, a filesystem path (``str`` / ``os.PathLike``),
+            or an upload wrapper exposing the path as ``.name``. The upload
+            component in this file does not request bytes, so a path-like
+            value is the expected case — confirm against the installed
+            Gradio version.
+
+    Returns:
+        The OCR text of every page, each followed by a newline. On any
+        failure a string starting with ``"Error extracting text: "`` is
+        returned instead of raising; the caller relies on that prefix.
+    """
+    scratch_path = None  # set only when this function creates a temp copy
+    try:
+        if isinstance(pdf_file, (bytes, bytearray)):
+            # pdf2image reads from disk, so in-memory bytes are spilled first.
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                scratch_path = tmp_file.name
+                tmp_file.write(pdf_file)
+            pdf_path = scratch_path
+        elif isinstance(pdf_file, (str, os.PathLike)):
+            # Checked before ``.name`` because pathlib paths also have a
+            # ``.name`` attribute, which is only the basename.
+            pdf_path = pdf_file
+        else:
+            pdf_path = getattr(pdf_file, "name", pdf_file)
+        pages = convert_from_path(pdf_path)
+        return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)
+    except Exception as e:
+        return f"Error extracting text: {str(e)}"
+    finally:
+        # Remove the temp copy even when conversion or OCR failed, and never
+        # delete a file this function did not create.
+        if scratch_path is not None:
+            try:
+                os.unlink(scratch_path)
+            except OSError:
+                pass
+def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
+    """Break ``text`` into consecutive pieces of at most ``chunk_size`` words.
+
+    Words are whatever ``str.split()`` yields, so any run of whitespace is a
+    separator; each piece is re-joined with single spaces. Text without any
+    words produces an empty list.
+    """
+    tokens = text.split()
+    starts = range(0, len(tokens), chunk_size)
+    return [' '.join(tokens[begin:begin + chunk_size]) for begin in starts]
+def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
+    """Ask Gemini for MCQs about ``chunk`` and return them as parsed rows.
+
+    The models in ``models_to_try`` are attempted in order; the first one
+    that returns non-empty text is used. A model that raises is logged with
+    ``print`` and skipped.
+
+    Args:
+        chunk: Source text the questions should be based on.
+        api_key: Gemini API key passed to ``genai.configure``.
+
+    Returns:
+        A list of ``[Question, OptionA, OptionB, OptionC, OptionD,
+        CorrectAnswer]`` rows; empty when no model produced usable output.
+    """
+    models_to_try = [
+        'gemini-2.0-flash-exp',
+        'gemini-1.5-flash',
+        'gemini-1.5-pro'
+    ]
+    prompt = f"""
+    Generate 10 multiple choice questions from the following text.
+    Each question must have:
+    - A clear, specific question
+    - 4 options labeled A, B, C, D
+    - One correct answer (A, B, C, or D)
+    Format your response as CSV with headers: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
+    Important formatting rules:
+    - Use commas only as field separators
+    - If any field contains a comma, wrap it in double quotes
+    - Each row should be on a new line
+    - Make questions specific and clear
+    - Ensure options are distinct and plausible
+    Text to analyze:
+    {chunk}
+    """
+    # Configure API
     genai.configure(api_key=api_key)
+    output = ""
+    for model_name in models_to_try:
+        try:
+            model = genai.GenerativeModel(model_name)
+            # Read ``.text`` inside the ``try``: the accessor may itself raise
+            # for a response without usable content, and a response from a
+            # failed attempt must not be re-read after the loop.
+            candidate = model.generate_content(prompt).text
+        except Exception as e:
+            print(f"Error with {model_name}: {e}")
+            continue
+        if candidate:
+            output = candidate.strip()
+            break
+    return _parse_mcq_rows(output)
+def _parse_mcq_rows(csv_text: str) -> List[List[str]]:
+    """Parse CSV-formatted model output into six-cell MCQ rows.
+
+    Lines with fewer than six cells (prose, code fences), rows with an empty
+    question, and any repetition of the header row are dropped; cells beyond
+    the sixth are discarded.
+    """
+    import csv  # local import: not among this file's top-level imports
+
+    header = ['question', 'optiona', 'optionb', 'optionc', 'optiond', 'correctanswer']
+    rows = []
+    for line in csv_text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            # Parse one line at a time so an unbalanced quote cannot swallow
+            # the rows that follow it.
+            cells = next(csv.reader([line], skipinitialspace=True), [])
+        except csv.Error:
+            continue
+        cells = [cell.strip() for cell in cells[:6]]
+        if len(cells) < 6 or not cells[0]:
+            continue
+        # The header may appear anywhere (e.g. after a code-fence line), so
+        # compare cell-by-cell instead of only inspecting the first line.
+        if [cell.lower() for cell in cells] == header:
+            continue
+        rows.append(cells)
+    return rows
+def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, Optional[str]]:
+    """Run the whole pipeline: OCR the PDF, generate MCQs, write an Excel file.
+
+    Args:
+        pdf_file: The uploaded PDF as delivered by the UI; passed through to
+            ``extract_text_from_pdf``.
+        api_key: Gemini API key.
+        chunk_size: Number of words per chunk sent to the model.
+        progress: Gradio progress tracker, called with a fraction and label.
+
+    Returns:
+        ``(status_message, excel_path)``. ``excel_path`` is the path of a
+        temporary ``.xlsx`` file on success and ``None`` on every failure
+        path, so the file output is cleared rather than given an empty
+        string that is not a valid path. Errors are reported in the status
+        message instead of being raised.
+    """
     if not api_key:
+        return "❌ Please provide your Gemini API key", None
+    if not pdf_file:
+        return "❌ Please upload a PDF file", None
     try:
+        # Extract text from PDF
+        progress(0.1, desc="Extracting text from PDF...")
+        extracted_text = extract_text_from_pdf(pdf_file)
+        # Match the extractor's full failure prefix; a bare "Error" check
+        # would reject documents whose own text starts with that word.
+        if extracted_text.startswith("Error extracting text:"):
+            return extracted_text, None
+        # Chunk the text
+        progress(0.2, desc="Chunking text...")
+        # range() inside chunk_text needs an int; coerce in case the UI
+        # delivers the slider value as a float.
+        chunks = chunk_text(extracted_text, int(chunk_size))
+        if not chunks:
+            return "❌ No text could be extracted from the PDF", None
+        # Generate MCQs from each chunk
+        all_mcq_data = []
+        total_chunks = len(chunks)
+        for i, chunk in enumerate(chunks):
+            if i:
+                # Small delay between API calls to avoid rate limiting;
+                # nothing to wait for before the first call.
+                time.sleep(1)
+            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
+            all_mcq_data.extend(generate_mcqs_from_chunk(chunk, api_key))
+        progress(0.95, desc="Creating Excel file...")
+        if not all_mcq_data:
+            return "❌ No MCQs could be generated from the PDF content", None
+        # Create DataFrame
+        df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
+        # Reserve a temp path, close the handle, then let pandas write the
+        # workbook straight to it (no intermediate in-memory copy).
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as temp_file:
+            excel_path = temp_file.name
+        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
+            df.to_excel(writer, index=False, sheet_name='MCQs')
+        progress(1.0, desc="Complete!")
+        success_message = f"✅ Successfully generated {len(all_mcq_data)} MCQs from {total_chunks} text chunks!"
+        return success_message, excel_path
     except Exception as e:
+        return f"❌ Error processing PDF: {str(e)}", None
+# Create Gradio interface
+def create_interface():
+    """Build the Gradio Blocks UI for the PDF-to-MCQ tool and return it.
+
+    The interface is only constructed here; launching is left to the caller.
+    """
+    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            """
+            # 📚 PDF to MCQ Generator
+            Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
+            ## How to use:
+            1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
+            2. Enter your API key below
+            3. Upload your PDF file
+            4. Adjust chunk size if needed (larger = fewer API calls, smaller = more focused questions)
+            5. Click "Generate MCQs" and wait for processing
+            6. Download the generated Excel file with your MCQs
+            """
+        )
+        # One row with two columns: the inputs (scale=2) and the
+        # status/download outputs (scale=1).
+        with gr.Row():
+            with gr.Column(scale=2):
+                api_key_input = gr.Textbox(
+                    label="🔑 Gemini API Key",
+                    placeholder="Enter your Gemini API key here...",
+                    type="password"
+                )
+                pdf_input = gr.File(
+                    label="📄 Upload PDF File",
+                    file_types=[".pdf"]
+                )
+                chunk_size_input = gr.Slider(
+                    minimum=500,
+                    maximum=3000,
+                    value=1500,
+                    step=100,
+                    label="📝 Chunk Size (words per processing batch)"
+                )
+                generate_btn = gr.Button(
+                    "🚀 Generate MCQs",
+                    variant="primary",
+                    size="lg"
+                )
+            with gr.Column(scale=1):
+                status_output = gr.Textbox(
+                    label="📊 Status",
+                    placeholder="Status updates will appear here...",
+                    lines=10
+                )
+                # Created hidden; the chained handler below makes it visible
+                # once it holds a value.
+                download_file = gr.File(
+                    label="⬇️ Download MCQs Excel File",
+                    visible=False
+                )
+        # Event handlers
+        # Step 1 runs the pipeline and fills the status box and file output;
+        # step 2 then shows the file component only if its value is truthy.
+        generate_btn.click(
+            fn=process_pdf_to_mcqs,
+            inputs=[pdf_input, api_key_input, chunk_size_input],
+            outputs=[status_output, download_file],
+            show_progress=True
+        ).then(
+            fn=lambda x: gr.update(visible=bool(x)),
+            inputs=[download_file],
+            outputs=[download_file]
+        )
+        gr.Markdown(
+            """
+            ## 📋 Features:
+            - **OCR Text Extraction**: Converts PDF pages to images and extracts text
+            - **Smart Chunking**: Breaks large documents into manageable pieces
+            - **Multiple AI Models**: Automatically tries different Gemini models for best results
+            - **Excel Output**: Download MCQs in a formatted Excel file
+            - **Progress Tracking**: Real-time updates on processing status
+            ## ⚠️ Notes:
+            - Processing time depends on PDF length and complexity
+            - Large PDFs are processed in chunks to avoid timeouts
+            - Make sure your PDF contains readable text (not just images)
+            - API key is not stored and only used for your session
+            """
+        )
+    return demo
+# Launch the app
 if __name__ == "__main__":
+    # Build the UI, then serve it on every network interface (0.0.0.0) at
+    # port 7860 so it is reachable from outside the host/container.
+    demo = create_interface()
+    demo.launch(server_name="0.0.0.0", server_port=7860)