"""Gradio app that OCRs a PDF and turns its content into MCQs via Google Gemini."""

import csv
import io
import os
import re
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path

import google.generativeai as genai


class PDFToMCQGenerator:
    """Pipeline: PDF -> OCR page texts -> statement batches -> Gemini-generated MCQs."""

    def __init__(self):
        # Set by configure_gemini(); model is a genai.GenerativeModel once ready.
        self.model = None
        self.configured = False

    def configure_gemini(self, api_key: str) -> str:
        """Configure the Gemini client with *api_key*.

        Returns a user-facing status string (✅/❌ prefixed) rather than raising,
        so the Gradio layer can display it directly.
        """
        try:
            genai.configure(api_key=api_key)
            # NOTE(review): 'gemini-pro' may be retired on current API versions;
            # confirm against the live Gemini model list before deploying.
            self.model = genai.GenerativeModel('gemini-pro')
            self.configured = True
            return "✅ Gemini configured successfully!"
        except Exception as e:
            # Reset the flag: a stale True from an earlier success must not
            # let generate_mcqs() run against a broken configuration.
            self.configured = False
            return f"❌ Error configuring Gemini: {str(e)}"

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """OCR every page of the PDF at *pdf_path*; returns one string per page.

        Raises Exception (with the original error chained) when pdf2image or
        Tesseract fails — e.g. missing poppler, unreadable file.
        """
        try:
            pages = convert_from_path(pdf_path)
            return [pytesseract.image_to_string(page) for page in pages]
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def split_into_statements(self, page_texts: List[str]) -> List[str]:
        """Split page texts into individual statements on sentence punctuation."""
        all_statements = []
        for page_text in page_texts:
            # Split on ., ! or ? and drop empty/whitespace-only fragments.
            statements = [s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()]
            all_statements.extend(statements)
        return all_statements

    def batch_statements(self, statements: List[str], batch_size: int = 5) -> List[List[str]]:
        """Group *statements* into consecutive chunks of at most *batch_size*."""
        return [statements[i:i + batch_size] for i in range(0, len(statements), batch_size)]

    def generate_mcqs(self, text_block: str) -> List[List[str]]:
        """Ask Gemini for 5 MCQs over *text_block*; returns rows of 6 fields.

        Each row is [Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer].
        Raises Exception if Gemini is not configured or the API call fails.
        """
        if not self.configured:
            raise Exception("Gemini not configured. Please provide API key first.")

        prompt = f"""
Generate exactly 5 MCQs from the following statements.
Each question must have:
- Clear, concise Question
- 4 Options (A-D) with only one correct answer
- Correct Answer (ONLY the letter A, B, C, or D — no text)

Return in CSV format:
Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer

Text:
{text_block}

Example format:
"What is the capital of France?","Paris","London","Berlin","Madrid","A"
"Which planet is known as the Red Planet?","Earth","Mars","Jupiter","Venus","B"

Important: Return ONLY the CSV data, no additional text.
"""
        try:
            response = self.model.generate_content(prompt)
            # Drop blank lines and markdown code fences before parsing.
            csv_lines = [
                line for line in response.text.strip().split('\n')
                if line.strip() and not line.startswith('```')
            ]
            mcq_data = []
            # Use a real CSV parser: questions and options routinely contain
            # commas inside their quoted fields (the example rows above do),
            # which a naive str.split(',') would shred into >6 pieces.
            for row in csv.reader(io.StringIO('\n'.join(csv_lines))):
                if len(row) == 6:
                    mcq_data.append([field.strip().strip('"') for field in row])
            return mcq_data
        except Exception as e:
            raise Exception(f"Error generating MCQs: {str(e)}") from e

    def process_pdf(self, pdf_file, api_key: str,
                    batch_size: int = 5) -> Tuple[Optional[pd.DataFrame], str]:
        """End-to-end processing: configure Gemini, OCR, batch, generate MCQs.

        *pdf_file* is the raw PDF bytes from the Gradio File component.
        Returns (DataFrame, status) on success or (None, error message) on failure.
        """
        config_status = self.configure_gemini(api_key)
        if not self.configured:
            return None, config_status

        pdf_path = None
        try:
            # pdf2image needs a file on disk, so spill the uploaded bytes
            # into a temp file; cleaned up in the finally block below.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file)
                pdf_path = tmp_file.name

            page_texts = self.extract_text_from_pdf(pdf_path)
            statements = self.split_into_statements(page_texts)
            if not statements:
                return None, "❌ No text could be extracted from the PDF. Please check if the PDF contains readable text."

            batches = self.batch_statements(statements, batch_size)

            all_mcqs = []
            successful_batches = 0
            for i, batch in enumerate(batches, 1):
                # Best-effort per batch: one failed Gemini call must not
                # abort the whole document.
                try:
                    mcqs = self.generate_mcqs(". ".join(batch))
                    all_mcqs.extend(mcqs)
                    successful_batches += 1
                except Exception as e:
                    print(f"Batch {i} failed: {str(e)}")
                    continue

            if not all_mcqs:
                return None, "❌ No MCQs could be generated. Please check your PDF content and try again."

            df = pd.DataFrame(
                all_mcqs,
                columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'],
            )
            status_msg = f"✅ Successfully processed {successful_batches} batches and generated {len(all_mcqs)} MCQs!"
            return df, status_msg
        except Exception as e:
            return None, f"❌ Error processing PDF: {str(e)}"
        finally:
            # Remove the temp PDF on both success and failure paths.
            if pdf_path is not None:
                try:
                    os.unlink(pdf_path)
                except OSError:
                    pass


# Single shared pipeline instance used by the Gradio callbacks.
generator = PDFToMCQGenerator()


def process_pdf_interface(pdf_file, api_key, batch_size=5):
    """Gradio callback: validate the raw inputs, then delegate to the generator.

    Returns (DataFrame or None, status message) matching the two outputs wired
    to the button below.
    """
    if pdf_file is None:
        return None, "❌ Please upload a PDF file."
    if not api_key.strip():
        return None, "❌ Please enter your Gemini API key."

    try:
        # gr.Number delivers a float (or None if cleared); normalize to int.
        batch_size = int(batch_size)
    except (TypeError, ValueError):
        return None, "❌ Batch size must be a number."
    if batch_size < 1 or batch_size > 10:
        return None, "❌ Batch size must be between 1 and 10."

    df, status = generator.process_pdf(pdf_file, api_key, batch_size)
    return df, status


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="PPSC PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PPSC PDF to MCQ Generator")
    gr.Markdown("Convert PDF content into multiple-choice questions using Google Gemini")

    with gr.Row():
        with gr.Column():
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="Enter your Google Gemini API key...",
                info="Get your API key from: https://aistudio.google.com/app/apikey",
            )
            pdf_file = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="binary",  # deliver raw bytes to process_pdf_interface
            )
            batch_size = gr.Number(
                label="Batch Size",
                value=5,
                minimum=1,
                maximum=10,
                step=1,
                info="Number of statements to process together (1-10)",
            )
            process_btn = gr.Button("Generate MCQs", variant="primary")

        with gr.Column():
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
            )
            mcq_output = gr.Dataframe(
                label="Generated MCQs",
                headers=["Question", "Option A", "Option B", "Option C", "Option D", "Correct Answer"],
                wrap=True,
            )

    process_btn.click(
        fn=process_pdf_interface,
        inputs=[pdf_file, api_key, batch_size],
        outputs=[mcq_output, status_output],
    )

    # Re-rendered whenever mcq_output changes: offers the table as an .xlsx
    # download (requires openpyxl for DataFrame.to_excel).
    @gr.render(inputs=mcq_output)
    def render_download_button(df):
        if df is not None and not df.empty:
            with gr.Row():
                with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
                    df.to_excel(tmp_file.name, index=False)
                    download_btn = gr.DownloadButton(
                        "📥 Download as Excel",
                        value=tmp_file.name,
                        file_name="generated_mcqs.xlsx",
                    )


# For Hugging Face deployment: bind to all interfaces only when running
# inside a Space (SPACE_ID is set by the HF runtime).
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
        share=False,
    )