Spaces:

Ali-Raza-167
/

pdf-to-mcqs

Sleeping

App Files Files Community

Ali-Raza-167 committed on Sep 22, 2025

Commit

fc39378

verified ·

1 Parent(s): f81ae08

Create app.py

Browse files

Files changed (1) hide show

app.py +219 -0

app.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import gradio as gr
+import pandas as pd
+from pdf2image import convert_from_path
+import pytesseract
+import google.generativeai as genai
+import tempfile
+import os
+import re
+import time
+from io import BytesIO
def batch_statements(statements, batch_size=5):
    """Yield consecutive slices of ``statements`` with at most ``batch_size`` items.

    Args:
        statements: A sliceable sequence (for example a list of sentences).
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        Slices of ``statements`` in their original order. The final slice may
        hold fewer than ``batch_size`` items.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently produces no batches, so reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(statements), batch_size):
        yield statements[start:start + batch_size]
# Answer letters accepted in the last column of a generated MCQ row.
_ANSWER_LETTERS = ('A', 'B', 'C', 'D')


def _notify(progress_callback, fraction, message):
    """Forward a progress update when the caller supplied a callback."""
    if progress_callback:
        progress_callback(fraction, message)


def _resolve_pdf_path(pdf_file):
    """Return ``(path, is_temporary)`` for the upload, spooling raw bytes to disk."""
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(pdf_file)
        return tmp_file.name, True
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file), False
    # File-like upload wrappers expose the on-disk location through ``.name``.
    file_name = getattr(pdf_file, 'name', None)
    if isinstance(file_name, str):
        return file_name, False
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def _ocr_pages(pdf_path, progress_callback=None):
    """Convert each PDF page to an image and return the OCR text per page."""
    pages = convert_from_path(pdf_path)
    page_texts = []
    for i, page in enumerate(pages):
        _notify(progress_callback, 0.1 + (i / len(pages)) * 0.3, f"Processing page {i+1}/{len(pages)}...")
        page_texts.append(pytesseract.image_to_string(page))
    return page_texts


def _split_into_statements(page_texts):
    """Split the page texts on '.', '!' and '?' and drop empty fragments."""
    statements = []
    for page_text in page_texts:
        statements.extend(s.strip() for s in re.split(r'[.!?]', page_text) if s.strip())
    return statements


def _build_prompt(text_block):
    """Return the MCQ-generation prompt for one block of source text."""
    return f"""
Generate exactly 5 multiple choice questions from the following text.
Each question must have:
- A clear question
- 4 options labeled A, B, C, D
- One correct answer (only the letter A, B, C, or D)
Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Text: {text_block}
"""


def _parse_mcq_rows(output):
    """Extract well-formed six-column MCQ rows from the model's CSV reply."""
    import csv  # local import: only this helper needs the csv module

    rows = []
    for line in output.splitlines():
        if ',' not in line:
            continue
        try:
            # csv.reader honours quoting, so a quoted question or option that
            # contains a comma stays in one column instead of splitting the row.
            fields = next(csv.reader([line.strip()], skipinitialspace=True), [])
        except csv.Error:
            continue
        parts = [field.strip() for field in fields]
        # The header row and any commentary fail this check and are skipped.
        if len(parts) == 6 and parts[5] in _ANSWER_LETTERS:
            rows.append(parts)
    return rows


def _generate_mcqs(model, batches, progress_callback=None):
    """Ask the model for MCQs batch by batch and collect the parsed rows."""
    mcq_data = []
    for i, batch in enumerate(batches):
        _notify(progress_callback, 0.6 + (i / len(batches)) * 0.3, f"Processing batch {i+1}/{len(batches)}...")
        try:
            response = model.generate_content(_build_prompt(". ".join(batch)))
            mcq_data.extend(_parse_mcq_rows(response.text.strip()))
        except Exception as e:
            # Best effort: one failing batch must not abort the whole document.
            print(f"Error generating MCQ for batch {i+1}: {str(e)}")
        # Small delay to avoid rate limiting; applied after failures as well.
        time.sleep(0.1)
    return mcq_data


def extract_text_from_pdf(pdf_file, api_key, progress_callback=None):
    """Extract text from a PDF via OCR and generate MCQs as an Excel workbook.

    Args:
        pdf_file: The uploaded PDF as raw bytes, a filesystem path, or a
            file-like object whose ``name`` attribute is a path.
        api_key: Google Generative AI API key.
        progress_callback: Optional callable ``(fraction, message)`` invoked
            with progress updates between 0.0 and 1.0.

    Returns:
        A ``(excel_bytes, status_message)`` tuple. ``excel_bytes`` holds the
        ``.xlsx`` content on success and is ``None`` on any failure, in which
        case ``status_message`` describes the problem.
    """
    if not api_key or not api_key.strip():
        return None, "Please enter your Google API key"
    if pdf_file is None:
        return None, "Please upload a PDF file"
    try:
        # Configure Gemini API
        genai.configure(api_key=api_key.strip())
        model = genai.GenerativeModel('gemini-1.5-flash')

        pdf_path, is_temporary = _resolve_pdf_path(pdf_file)
        try:
            _notify(progress_callback, 0.1, "Converting PDF to images...")
            page_texts = _ocr_pages(pdf_path, progress_callback)
        finally:
            # Remove only the copy created here, and do so even when OCR fails.
            if is_temporary:
                os.unlink(pdf_path)

        _notify(progress_callback, 0.4, "Splitting text into statements...")
        all_statements = _split_into_statements(page_texts)
        if not all_statements:
            return None, "No text could be extracted from the PDF"

        _notify(progress_callback, 0.5, f"Found {len(all_statements)} statements. Creating batches...")
        batches = list(batch_statements(all_statements, batch_size=5))

        _notify(progress_callback, 0.6, f"Generating MCQs from {len(batches)} batches...")
        mcq_data = _generate_mcqs(model, batches, progress_callback)

        _notify(progress_callback, 0.95, "Creating Excel file...")
        if not mcq_data:
            return None, "No MCQs could be generated from the text"

        # Build the workbook in memory so the caller decides where it is stored.
        df = pd.DataFrame(mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
        excel_buffer = BytesIO()
        df.to_excel(excel_buffer, index=False, engine='openpyxl')
        excel_buffer.seek(0)

        _notify(progress_callback, 1.0, f"Complete! Generated {len(mcq_data)} MCQs")
        return excel_buffer.getvalue(), f"Successfully generated {len(mcq_data)} MCQs from {len(page_texts)} pages"
    except Exception as e:
        # Top-level boundary for the UI: report the failure as a status message.
        return None, f"Error processing PDF: {str(e)}"
def process_pdf_with_progress(pdf_file, api_key, progress=gr.Progress()):
    """Run the PDF-to-MCQ pipeline for the Gradio click handler.

    Args:
        pdf_file: Value delivered by the PDF upload component.
        api_key: Google Generative AI API key typed by the user.
        progress: Gradio progress tracker; called with each progress update.

    Returns:
        A ``(file_path, status_message)`` tuple. ``file_path`` is the location
        of the generated ``.xlsx`` file, or ``None`` when generation failed.
    """
    def progress_callback(value, desc):
        progress(value, desc=desc)

    excel_bytes, status = extract_text_from_pdf(pdf_file, api_key, progress_callback)
    if excel_bytes is None:
        return None, status

    # A File output component is given a path rather than raw bytes, so the
    # workbook is written to a temporary file that outlives this call.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', prefix='mcqs_') as excel_file:
        excel_file.write(excel_bytes)
    return excel_file.name, status
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the PDF to MCQ generator.

    The layout has an input column (API key, PDF upload, generate button) and
    an output column (status text, downloadable Excel file). The button click
    is wired to ``process_pdf_with_progress``.

    Returns:
        The configured ``gr.Blocks`` application, not yet launched.
    """
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as app:
        gr.Markdown(
            """
            # 📚 PDF to MCQ Generator
            Upload a PDF file and generate multiple choice questions automatically using Google's Gemini AI.
            **Instructions:**
            1. Get your free Google AI API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
            2. Enter your API key below
            3. Upload your PDF file
            4. Click "Generate MCQs" and wait for processing
            5. Download the generated Excel file with MCQs
            """
        )
        with gr.Row():
            with gr.Column():
                api_key_input = gr.Textbox(
                    label="Google AI API Key",
                    placeholder="Enter your Google AI API key here...",
                    type="password",
                    info="Get your free API key from Google AI Studio"
                )
                # type="binary" delivers the upload as bytes, which is what the
                # processing function writes to its temporary PDF. No `info`
                # argument here: it is a Textbox-style option that the File
                # component does not take in current Gradio releases.
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="binary"
                )
                generate_btn = gr.Button(
                    "🚀 Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            with gr.Column():
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    info="Processing status will appear here"
                )
                download_file = gr.File(
                    label="Download MCQs Excel File",
                    interactive=False
                )
        gr.Markdown(
            """
            ### Features:
            - 🤖 Powered by Google's Gemini AI
            - 📄 Extracts text from PDF using OCR
            - ❓ Generates 5 MCQs per text batch
            - 📊 Outputs organized Excel file
            - 🔄 Progress tracking during processing
            ### Tips for better results:
            - Use PDFs with clear, readable text
            - Ensure good image quality for OCR
            - Educational content works best for MCQ generation
            """
        )
        # Event handler: outputs are ordered to match the handler's
        # (file, status) return tuple.
        generate_btn.click(
            fn=process_pdf_with_progress,
            inputs=[pdf_input, api_key_input],
            outputs=[download_file, status_output],
            show_progress="full"
        )
    return app
# Script entry point: build the UI and serve it on all interfaces, port 7860,
# with a public share link requested.
if __name__ == "__main__":
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    app = create_interface()
    app.launch(**launch_options)