Spaces:
Sleeping
Sleeping
import csv
import io
import os
import re
import tempfile
from typing import List, Tuple

import google.generativeai as genai
import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
class PDFToMCQGenerator:
    """Turn a PDF into multiple-choice questions.

    Pipeline: PDF pages -> OCR text (pytesseract) -> sentence-like
    statements -> batches -> Gemini prompt -> parsed CSV rows.
    """

    def __init__(self):
        self.model = None        # genai.GenerativeModel, set by configure_gemini()
        self.configured = False  # True only after a successful configuration

    def configure_gemini(self, api_key: str) -> str:
        """Configure the Gemini client; return a user-facing status message."""
        try:
            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel('gemini-pro')
            self.configured = True
            return "β Gemini configured successfully!"
        except Exception as e:
            # Reset the flag so a failed re-configuration cannot leave a
            # stale "configured" state behind from an earlier successful call.
            self.configured = False
            return f"β Error configuring Gemini: {str(e)}"

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """OCR every page of the PDF at *pdf_path*; return one string per page.

        Raises Exception with a descriptive message on any conversion/OCR error.
        """
        try:
            pages = convert_from_path(pdf_path)
            return [pytesseract.image_to_string(page) for page in pages]
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def split_into_statements(self, page_texts: List[str]) -> List[str]:
        """Split page texts on sentence punctuation into non-empty statements."""
        all_statements: List[str] = []
        for page_text in page_texts:
            all_statements.extend(
                s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()
            )
        return all_statements

    def batch_statements(self, statements: List[str], batch_size: int = 5) -> List[List[str]]:
        """Group statements into consecutive batches of at most *batch_size*."""
        return [statements[i:i + batch_size] for i in range(0, len(statements), batch_size)]

    @staticmethod
    def _parse_mcq_csv(raw_text: str) -> List[List[str]]:
        """Parse the model's CSV reply into cleaned 6-field rows.

        Uses the csv module so commas inside quoted fields (common in
        question text, and present in the prompt's own example rows) do not
        split a row apart — a naive str.split(',') silently dropped such rows.
        Markdown code fences and blank lines are skipped; rows that do not
        have exactly 6 fields are ignored.
        """
        rows: List[List[str]] = []
        for line in raw_text.strip().split('\n'):
            line = line.strip()
            if not line or line.startswith('```'):
                continue  # skip blanks and markdown code fences
            for parts in csv.reader(io.StringIO(line)):
                if len(parts) == 6:
                    rows.append([part.strip().strip('"') for part in parts])
        return rows

    def generate_mcqs(self, text_block: str) -> List[List[str]]:
        """Ask Gemini for 5 MCQs over *text_block*; return parsed CSV rows.

        Raises Exception if Gemini is not configured or the API call fails.
        """
        if not self.configured:
            raise Exception("Gemini not configured. Please provide API key first.")
        prompt = f"""
Generate exactly 5 MCQs from the following statements.
Each question must have:
- Clear, concise Question
- 4 Options (A-D) with only one correct answer
- Correct Answer (ONLY the letter A, B, C, or D β no text)
Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
Text:
{text_block}
Example format:
"What is the capital of France?","Paris","London","Berlin","Madrid","A"
"Which planet is known as the Red Planet?","Earth","Mars","Jupiter","Venus","B"
Important: Return ONLY the CSV data, no additional text.
"""
        try:
            response = self.model.generate_content(prompt)
            return self._parse_mcq_csv(response.text)
        except Exception as e:
            raise Exception(f"Error generating MCQs: {str(e)}")

    def process_pdf(self, pdf_file, api_key: str, batch_size: int = 5) -> Tuple[pd.DataFrame, str]:
        """Run the full pipeline on uploaded PDF bytes.

        Returns (DataFrame of MCQs, status message) on success, or
        (None, error message) on failure.
        """
        config_status = self.configure_gemini(api_key)
        if not self.configured:
            return None, config_status
        pdf_path = None
        try:
            # Persist the uploaded bytes so pdf2image can read from a path.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file)
                pdf_path = tmp_file.name
            page_texts = self.extract_text_from_pdf(pdf_path)
            statements = self.split_into_statements(page_texts)
            if len(statements) == 0:
                return None, "β No text could be extracted from the PDF. Please check if the PDF contains readable text."
            batches = self.batch_statements(statements, batch_size)
            all_mcqs: List[List[str]] = []
            successful_batches = 0
            for i, batch in enumerate(batches, 1):
                try:
                    mcqs = self.generate_mcqs(". ".join(batch))
                    all_mcqs.extend(mcqs)
                    successful_batches += 1
                except Exception as e:
                    # Best-effort: one failed batch should not abort the run.
                    print(f"Batch {i} failed: {str(e)}")
            if len(all_mcqs) == 0:
                return None, "β No MCQs could be generated. Please check your PDF content and try again."
            df = pd.DataFrame(all_mcqs, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
            status_msg = f"β Successfully processed {successful_batches} batches and generated {len(all_mcqs)} MCQs!"
            return df, status_msg
        except Exception as e:
            return None, f"β Error processing PDF: {str(e)}"
        finally:
            # Always remove the temp file. The original cleanup ran only on the
            # success and exception paths, leaking the file on the early
            # "no text" / "no MCQs" returns.
            if pdf_path is not None:
                try:
                    os.unlink(pdf_path)
                except OSError:
                    pass
# Module-level singleton shared by every Gradio request in this process.
generator = PDFToMCQGenerator()
def process_pdf_interface(pdf_file, api_key, batch_size=5):
    """Validate the UI inputs, then delegate to the shared generator.

    Returns a (DataFrame-or-None, status message) pair for the two outputs.
    """
    # Guard clauses: reject missing/invalid inputs before any work happens.
    if pdf_file is None:
        return None, "β Please upload a PDF file."
    if not api_key.strip():
        return None, "β Please enter your Gemini API key."
    try:
        size = int(batch_size)
    except ValueError:
        return None, "β Batch size must be a number."
    if not 1 <= size <= 10:
        return None, "β Batch size must be between 1 and 10."
    # process_pdf already returns (df-or-None, status), so pass it through.
    return generator.process_pdf(pdf_file, api_key, size)
# Create Gradio interface. Component variables declared here (api_key,
# pdf_file, batch_size, process_btn, status_output, mcq_output) are wired
# together in the click handler below; `demo` is launched at the bottom
# of the file.
with gr.Blocks(title="PPSC PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π PPSC PDF to MCQ Generator")
    gr.Markdown("Convert PDF content into multiple-choice questions using Google Gemini")
    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",  # mask the key in the UI
                placeholder="Enter your Google Gemini API key...",
                info="Get your API key from: https://aistudio.google.com/app/apikey"
            )
            pdf_file = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="binary"  # deliver raw bytes to process_pdf_interface
            )
            batch_size = gr.Number(
                label="Batch Size",
                value=5,
                minimum=1,
                maximum=10,
                step=1,
                info="Number of statements to process together (1-10)"
            )
            process_btn = gr.Button("Generate MCQs", variant="primary")
        # Right column: status text and the generated MCQ table.
        with gr.Column():
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3
            )
            mcq_output = gr.Dataframe(
                label="Generated MCQs",
                headers=["Question", "Option A", "Option B", "Option C", "Option D", "Correct Answer"],
                wrap=True,
            )
    # Process button click: inputs map positionally onto
    # process_pdf_interface(pdf_file, api_key, batch_size); the returned
    # (df, status) pair fills (mcq_output, status_output).
    process_btn.click(
        fn=process_pdf_interface,
        inputs=[pdf_file, api_key, batch_size],
        outputs=[mcq_output, status_output]
    )
# Add download functionality
def render_download_button(df):
    """Offer *df* as an Excel download via a DownloadButton.

    NOTE(review): this helper is never wired to any Gradio event in this
    file, so it appears to be dead code. It also creates components outside
    the `gr.Blocks` context above, which presumably would not attach them
    to the rendered UI — confirm the intended usage before relying on it.
    The Excel file written to a NamedTemporaryFile is never deleted.
    """
    if df is not None and not df.empty:
        with gr.Row():
            # Create a temporary file for download
            with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
                df.to_excel(tmp_file.name, index=False)
            download_btn = gr.DownloadButton(
                "π₯ Download as Excel",
                value=tmp_file.name,
                file_name="generated_mcqs.xlsx"
            )
# For Hugging Face deployment
if __name__ == "__main__":
    # SPACE_ID is set inside a Hugging Face Space container; bind to all
    # interfaces there, otherwise let Gradio pick its default host.
    running_in_space = os.getenv("SPACE_ID") is not None
    demo.launch(
        server_name="0.0.0.0" if running_in_space else None,
        share=False,
    )