"""Gradio app that OCRs a PDF and turns its content into MCQs via Google Gemini."""

import csv
import io
import os
import re
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd
import pytesseract
from pdf2image import convert_from_path

import google.generativeai as genai


class PDFToMCQGenerator:
    """Pipeline: PDF -> OCR page texts -> statement batches -> Gemini-generated MCQs."""

    def __init__(self):
        # Set by configure_gemini(); model is a genai.GenerativeModel once ready.
        self.model = None
        self.configured = False

    def configure_gemini(self, api_key: str) -> str:
        """Configure the Gemini client with *api_key*.

        Returns a user-facing status string (✅/❌ prefixed) rather than raising,
        so the Gradio layer can display it directly.
        """
        try:
            genai.configure(api_key=api_key)
            # NOTE(review): 'gemini-pro' may be retired on current API versions;
            # confirm against the live Gemini model list before deploying.
            self.model = genai.GenerativeModel('gemini-pro')
            self.configured = True
            return "✅ Gemini configured successfully!"
        except Exception as e:
            # Reset the flag: a stale True from an earlier success must not
            # let generate_mcqs() run against a broken configuration.
            self.configured = False
            return f"❌ Error configuring Gemini: {str(e)}"

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """OCR every page of the PDF at *pdf_path*; returns one string per page.

        Raises Exception (with the original error chained) when pdf2image or
        Tesseract fails — e.g. missing poppler, unreadable file.
        """
        try:
            pages = convert_from_path(pdf_path)
            return [pytesseract.image_to_string(page) for page in pages]
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def split_into_statements(self, page_texts: List[str]) -> List[str]:
        """Split page texts into individual statements on sentence punctuation."""
        all_statements = []
        for page_text in page_texts:
            # Split on ., ! or ? and drop empty/whitespace-only fragments.
            statements = [s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()]
            all_statements.extend(statements)
        return all_statements

    def batch_statements(self, statements: List[str], batch_size: int = 5) -> List[List[str]]:
        """Group *statements* into consecutive chunks of at most *batch_size*."""
        return [statements[i:i + batch_size] for i in range(0, len(statements), batch_size)]

    def generate_mcqs(self, text_block: str) -> List[List[str]]:
        """Ask Gemini for 5 MCQs over *text_block*; returns rows of 6 fields.

        Each row is [Question, OptionA, OptionB, OptionC, OptionD, CorrectAnswer].
        Raises Exception if Gemini is not configured or the API call fails.
        """
        if not self.configured:
            raise Exception("Gemini not configured. Please provide API key first.")

        prompt = f"""
Generate exactly 5 MCQs from the following statements.
Each question must have:
- Clear, concise Question
- 4 Options (A-D) with only one correct answer
- Correct Answer (ONLY the letter A, B, C, or D — no text)

Return in CSV format:
Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer

Text:
{text_block}

Example format:
"What is the capital of France?","Paris","London","Berlin","Madrid","A"
"Which planet is known as the Red Planet?","Earth","Mars","Jupiter","Venus","B"

Important: Return ONLY the CSV data, no additional text.
"""
        try:
            response = self.model.generate_content(prompt)
            # Drop blank lines and markdown code fences before parsing.
            csv_lines = [
                line for line in response.text.strip().split('\n')
                if line.strip() and not line.startswith('```')
            ]
            mcq_data = []
            # Use a real CSV parser: questions and options routinely contain
            # commas inside their quoted fields (the example rows above do),
            # which a naive str.split(',') would shred into >6 pieces.
            for row in csv.reader(io.StringIO('\n'.join(csv_lines))):
                if len(row) == 6:
                    mcq_data.append([field.strip().strip('"') for field in row])
            return mcq_data
        except Exception as e:
            raise Exception(f"Error generating MCQs: {str(e)}") from e

    def process_pdf(self, pdf_file, api_key: str,
                    batch_size: int = 5) -> Tuple[Optional[pd.DataFrame], str]:
        """End-to-end processing: configure Gemini, OCR, batch, generate MCQs.

        *pdf_file* is the raw PDF bytes from the Gradio File component.
        Returns (DataFrame, status) on success or (None, error message) on failure.
        """
        config_status = self.configure_gemini(api_key)
        if not self.configured:
            return None, config_status

        pdf_path = None
        try:
            # pdf2image needs a file on disk, so spill the uploaded bytes
            # into a temp file; cleaned up in the finally block below.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file)
                pdf_path = tmp_file.name

            page_texts = self.extract_text_from_pdf(pdf_path)
            statements = self.split_into_statements(page_texts)
            if not statements:
                return None, "❌ No text could be extracted from the PDF. Please check if the PDF contains readable text."

            batches = self.batch_statements(statements, batch_size)

            all_mcqs = []
            successful_batches = 0
            for i, batch in enumerate(batches, 1):
                # Best-effort per batch: one failed Gemini call must not
                # abort the whole document.
                try:
                    mcqs = self.generate_mcqs(". ".join(batch))
                    all_mcqs.extend(mcqs)
                    successful_batches += 1
                except Exception as e:
                    print(f"Batch {i} failed: {str(e)}")
                    continue

            if not all_mcqs:
                return None, "❌ No MCQs could be generated. Please check your PDF content and try again."

            df = pd.DataFrame(
                all_mcqs,
                columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'],
            )
            status_msg = f"✅ Successfully processed {successful_batches} batches and generated {len(all_mcqs)} MCQs!"
            return df, status_msg
        except Exception as e:
            return None, f"❌ Error processing PDF: {str(e)}"
        finally:
            # Remove the temp PDF on both success and failure paths.
            if pdf_path is not None:
                try:
                    os.unlink(pdf_path)
                except OSError:
                    pass


# Single shared pipeline instance used by the Gradio callbacks.
generator = PDFToMCQGenerator()


def process_pdf_interface(pdf_file, api_key, batch_size=5):
    """Gradio callback: validate the raw inputs, then delegate to the generator.

    Returns (DataFrame or None, status message) matching the two outputs wired
    to the button below.
    """
    if pdf_file is None:
        return None, "❌ Please upload a PDF file."
    if not api_key.strip():
        return None, "❌ Please enter your Gemini API key."

    try:
        # gr.Number delivers a float (or None if cleared); normalize to int.
        batch_size = int(batch_size)
    except (TypeError, ValueError):
        return None, "❌ Batch size must be a number."
    if batch_size < 1 or batch_size > 10:
        return None, "❌ Batch size must be between 1 and 10."

    df, status = generator.process_pdf(pdf_file, api_key, batch_size)
    return df, status


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="PPSC PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PPSC PDF to MCQ Generator")
    gr.Markdown("Convert PDF content into multiple-choice questions using Google Gemini")

    with gr.Row():
        with gr.Column():
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="Enter your Google Gemini API key...",
                info="Get your API key from: https://aistudio.google.com/app/apikey",
            )
            pdf_file = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="binary",  # deliver raw bytes to process_pdf_interface
            )
            batch_size = gr.Number(
                label="Batch Size",
                value=5,
                minimum=1,
                maximum=10,
                step=1,
                info="Number of statements to process together (1-10)",
            )
            process_btn = gr.Button("Generate MCQs", variant="primary")

        with gr.Column():
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
            )
            mcq_output = gr.Dataframe(
                label="Generated MCQs",
                headers=["Question", "Option A", "Option B", "Option C", "Option D", "Correct Answer"],
                wrap=True,
            )

    process_btn.click(
        fn=process_pdf_interface,
        inputs=[pdf_file, api_key, batch_size],
        outputs=[mcq_output, status_output],
    )

    # Re-rendered whenever mcq_output changes: offers the table as an .xlsx
    # download (requires openpyxl for DataFrame.to_excel).
    @gr.render(inputs=mcq_output)
    def render_download_button(df):
        if df is not None and not df.empty:
            with gr.Row():
                with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
                    df.to_excel(tmp_file.name, index=False)
                    download_btn = gr.DownloadButton(
                        "📥 Download as Excel",
                        value=tmp_file.name,
                        file_name="generated_mcqs.xlsx",
                    )


# For Hugging Face deployment: bind to all interfaces only when running
# inside a Space (SPACE_ID is set by the HF runtime).
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
        share=False,
    )