File size: 13,685 Bytes
95fb4d2
c0e8eab
 
 
bce1ee4
95fb4d2
 
 
 
 
238717d
95fb4d2
 
 
 
 
 
c0e8eab
65956db
95fb4d2
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
 
c0e8eab
54b1661
95fb4d2
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
c0e8eab
54b1661
95fb4d2
41c3d7f
 
 
 
95fb4d2
 
 
 
 
 
 
54b1661
95fb4d2
 
 
 
 
238717d
 
95fb4d2
238717d
 
95fb4d2
 
238717d
 
 
 
95fb4d2
 
 
 
 
 
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
 
 
41c3d7f
95fb4d2
 
 
 
41c3d7f
95fb4d2
 
41c3d7f
95fb4d2
 
 
 
41c3d7f
 
 
 
238717d
41c3d7f
238717d
41c3d7f
 
 
238717d
 
41c3d7f
238717d
 
 
 
41c3d7f
238717d
 
 
 
 
 
 
41c3d7f
95fb4d2
238717d
 
 
 
 
 
 
41c3d7f
 
 
 
 
238717d
41c3d7f
 
238717d
 
 
 
 
 
41c3d7f
 
 
 
 
95fb4d2
41c3d7f
95fb4d2
c0e8eab
54b1661
95fb4d2
c0e8eab
65956db
95fb4d2
 
65956db
95fb4d2
c0e8eab
95fb4d2
 
65956db
95fb4d2
 
65956db
95fb4d2
 
 
 
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
54b1661
95fb4d2
 
54b1661
 
95fb4d2
 
 
 
65956db
95fb4d2
238717d
 
 
 
 
 
 
 
 
95fb4d2
238717d
95fb4d2
65956db
 
 
95fb4d2
65956db
 
 
95fb4d2
 
 
54b1661
95fb4d2
 
 
c0e8eab
65956db
c0e8eab
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
 
 
 
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
95fb4d2
54b1661
95fb4d2
 
 
 
54b1661
 
 
 
 
 
 
 
95fb4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b1661
95fb4d2
 
 
65956db
95fb4d2
 
 
 
 
 
 
 
54b1661
 
95fb4d2
 
 
 
 
54b1661
 
95fb4d2
 
 
54b1661
95fb4d2
 
 
 
c0e8eab
95fb4d2
c0e8eab
95fb4d2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import os
import pandas as pd
from pdf2image import convert_from_path
import pytesseract
import tempfile
import io
import gradio as gr
import google.generativeai as genai
from typing import List, Tuple
import time
import csv

# Configure Gemini API
def configure_gemini_api(api_key: str):
    """Register *api_key* with the google-generativeai client.

    Args:
        api_key: Gemini API key obtained from Google AI Studio.

    Returns:
        A human-readable confirmation string for display in the UI.
    """
    genai.configure(api_key=api_key)
    confirmation = "βœ… API Key configured successfully!"
    return confirmation

def extract_text_from_pdf(pdf_file_path: str) -> str:
    """Extract text from a PDF using OCR (one rendered image per page).

    Args:
        pdf_file_path: Path to the PDF file on disk.

    Returns:
        The concatenated page texts, each followed by a newline.  On failure
        returns a string beginning with "Error" — callers detect failure by
        checking that prefix rather than catching exceptions.
    """
    try:
        # Render every page to an image, then OCR each one.  Joining once
        # avoids the quadratic cost of repeated str += inside the loop.
        pages = convert_from_path(pdf_file_path)
        return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)
    except Exception as e:
        # Broad catch is intentional: pdf2image/poppler/tesseract can all
        # raise, and the UI surfaces the message as a status string.
        return f"Error extracting text: {str(e)}"

def chunk_text(text: str, chunk_size: int = 500) -> List[str]:  # Changed default to 500 for more chunks/MCQs
    """Split *text* into whitespace-delimited chunks of at most *chunk_size* words.

    The final chunk may be shorter; an empty/whitespace-only input yields [].
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]:  # Added mcqs_per_chunk param, default 20
    """Generate MCQs from a text chunk using Gemini API.

    Args:
        chunk: Plain-text passage the questions should be based on.
        api_key: Gemini API key; re-applied via genai.configure on each call.
        chunk_number: 1-based chunk index, used only for console logging.
        mcqs_per_chunk: Number of questions requested from the model.

    Returns:
        A list of 6-element rows:
        [question, option_a, option_b, option_c, option_d, correct_letter].
        Empty list when no model responded or no response line parsed.
    """
    # Verbose console logging is deliberate: the model's output is free-form
    # text, so these prints are the main tool for diagnosing parse failures.
    print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
    print(f"Chunk length: {len(chunk)} characters")
    print(f"Chunk preview: {chunk[:200]}...")
    
    # Tried in order; the first model that returns non-empty text wins.
    models_to_try = [
        'gemini-2.0-flash-exp',
        'gemini-1.5-flash',
        'gemini-1.5-pro'
    ]
    
    prompt = f"""
    Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
    Each question must have:
    - A clear, specific question
    - 4 options labeled A, B, C, D
    - One correct answer (A, B, C, or D)
    
    IMPORTANT: Do NOT include any headers or column names in your response.
    Format each question as: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
    
    Rules:
    - Start directly with the first question, no headers
    - Use commas only as field separators
    - If any field contains a comma, wrap it in double quotes
    - Each question should be on a new line
    - Make questions specific and clear based on the text content
    - Ensure all 4 options are plausible but only one is correct
    - The correct answer should be A, B, C, or D only
    
    Text to analyze:
    {chunk}
    """
    
    # Configure API
    genai.configure(api_key=api_key)
    
    mcq_data = []
    response = None
    
    # Fallback chain: any exception (quota, unknown model, network, ...)
    # simply moves on to the next model name.
    for model_name in models_to_try:
        try:
            print(f"Trying model: {model_name}")
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            
            if response.text:
                print(f"βœ… Successfully used model: {model_name}")
                break
        except Exception as e:
            print(f"❌ Error with {model_name}: {e}")
            continue
    
    if response and response.text:
        output = response.text.strip()
        print(f"\n--- RAW AI RESPONSE FOR CHUNK {chunk_number} ---")
        print(output)
        print("--- END RAW RESPONSE ---\n")
        
        lines = [line.strip() for line in output.splitlines() if line.strip()]
        print(f"Total non-empty lines in response: {len(lines)}")
        
        for idx, line in enumerate(lines):
            print(f"Processing line {idx + 1}: {line[:100]}...")
            
            # Skip any header lines that might still appear
            # despite the prompt's "no headers" instruction.
            if ('Question' in line and 'OptionA' in line and 'OptionB' in line) or line.startswith('Question,'):
                print(f"❌ Skipped header line: {line[:50]}...")
                continue
            
            # Skip empty lines or lines that don't look like MCQs
            # (a well-formed row needs at least 5 separating commas).
            if not line or line.count(',') < 5:
                print(f"❌ Skipped invalid line (comma count: {line.count(',')}): {line[:50]}...")
                continue
            
            # Parse CSV line using proper CSV parsing
            # so quoted fields containing commas stay intact.
            try:
                # Use StringIO to parse the line as CSV
                csv_reader = csv.reader([line])
                parts = next(csv_reader)
                print(f"Parsed parts: {len(parts)} fields")
                
                # Ensure we have exactly 6 parts and the question is not empty
                # (extra trailing fields beyond 6 are silently dropped).
                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
                    # Clean up each part
                    cleaned_parts = [part.strip() for part in parts[:6]]
                    # Validate that correct answer is A, B, C, or D
                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
                        mcq_data.append(cleaned_parts)
                        print(f"βœ… Added MCQ: {cleaned_parts[0][:50]}... (Answer: {cleaned_parts[5]})")
                    else:
                        print(f"❌ Invalid answer format: {cleaned_parts[5]}")
                else:
                    print(f"❌ Invalid parts count or empty question. Parts: {len(parts)}, First part: '{parts[0] if parts else 'N/A'}'")
                        
            except csv.Error as e:
                print(f"❌ CSV parsing error: {e}")
                # Fallback to simple split if CSV parsing fails
                # (naive split: quoted commas will inflate the field count here).
                parts = line.split(',')
                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
                    cleaned_parts = [part.strip().strip('"') for part in parts[:6]]
                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
                        mcq_data.append(cleaned_parts)
                        print(f"βœ… Added MCQ (fallback): {cleaned_parts[0][:50]}...")
                    else:
                        print(f"❌ Invalid answer format (fallback): {cleaned_parts[5]}")
    else:
        print(f"❌ No response received for chunk {chunk_number}")
    
    print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
    return mcq_data

def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]:  # Added mcqs_per_chunk param, default 20
    """Main function to process PDF and generate MCQs.

    Args:
        pdf_file: Gradio file object; only its .name (temp file path) is used.
        api_key: Gemini API key forwarded to each generation call.
        chunk_size: Words per chunk handed to the model.
        mcqs_per_chunk: Questions requested per chunk.
        progress: Gradio progress tracker (the gr.Progress() default is the
            documented Gradio idiom — Gradio injects a live tracker at call time).

    Returns:
        (status_message, xlsx_path) on success.  On any failure the second
        element is None — NOTE(review): the Tuple[str, str] annotation is
        inaccurate for the error paths.
    """
    if not api_key:
        return "❌ Please provide your Gemini API key", None
    
    if not pdf_file:
        return "❌ Please upload a PDF file", None
    
    try:
        # Extract text from PDF
        progress(0.1, desc="Extracting text from PDF...")
        extracted_text = extract_text_from_pdf(pdf_file.name)
        
        # extract_text_from_pdf signals failure via an "Error" string prefix.
        if extracted_text.startswith("Error"):
            return extracted_text, None
        
        # Chunk the text
        progress(0.2, desc="Chunking text...")
        chunks = chunk_text(extracted_text, chunk_size)
        
        if not chunks:
            return "❌ No text could be extracted from the PDF", None
        
        # Generate MCQs from each chunk
        all_mcq_data = []
        total_chunks = len(chunks)
        
        # Extraction took 0.0-0.2 of the bar; chunk processing fills 0.2-0.9.
        for i, chunk in enumerate(chunks):
            progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
            
            chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i+1, mcqs_per_chunk)
            all_mcq_data.extend(chunk_mcqs)
            
            # Reduced delay to 0.5s for faster processing (to maximize MCQs, but monitor rate limits)
            time.sleep(0.5)
        
        progress(0.95, desc="Creating Excel file...")
        
        if not all_mcq_data:
            return "❌ No MCQs could be generated from the PDF content", None
        
        # Remove any duplicate questions
        # (case-insensitive on the question text; first occurrence wins).
        seen_questions = set()
        unique_mcq_data = []
        for mcq in all_mcq_data:
            question_text = mcq[0].lower().strip()
            if question_text not in seen_questions:
                seen_questions.add(question_text)
                unique_mcq_data.append(mcq)
        
        # Create DataFrame
        df = pd.DataFrame(unique_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
        
        # Create temporary Excel file for download
        # (delete=False so the file survives for Gradio to serve it).
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
        temp_file.close()  # Close to allow pandas to write to it
        
        # Write Excel file
        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='MCQs')
        
        progress(1.0, desc="Complete!")
        
        success_message = f"βœ… Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
        
        return success_message, temp_file.name
        
    except Exception as e:
        # Catch-all so the UI always gets a status string instead of a traceback.
        return f"❌ Error processing PDF: {str(e)}", None

# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the MCQ generator.

    Layout: a two-column row (inputs on the left, status/download on the
    right) between two Markdown help sections, plus the click wiring that
    runs process_pdf_to_mcqs and then reveals the download component.
    """
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # πŸ“š PDF to MCQ Generator
            
            Upload a PDF document and generate multiple choice questions automatically using Google's Gemini AI.
            
            ## How to use:
            1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
            2. Enter your API key below
            3. Upload your PDF file
            4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
            5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
            6. Click "Generate MCQs" and wait for processing
            7. Download the generated Excel file with your MCQs
            """
        )
        
        with gr.Row():
            # Left column (wider): all user inputs.
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="πŸ”‘ Gemini API Key",
                    placeholder="Enter your Gemini API key here...",
                    type="password"
                )
                
                pdf_input = gr.File(
                    label="πŸ“„ Upload PDF File",
                    file_types=[".pdf"]
                )
                
                chunk_size_input = gr.Slider(
                    minimum=300,  # Lowered min to allow even smaller chunks
                    maximum=3000,
                    value=500,  # Changed default to 500 for more chunks
                    step=100,
                    label="πŸ“ Chunk Size (words per processing batch)"
                )
                
                mcqs_per_chunk_input = gr.Slider(
                    minimum=5,
                    maximum=50,  # Increased max for more MCQs per chunk
                    value=20,  # New slider for MCQs per chunk, default 20
                    step=5,
                    label="πŸ”’ MCQs per Chunk (higher = more MCQs, but may increase failures)"
                )
                
                generate_btn = gr.Button(
                    "πŸš€ Generate MCQs",
                    variant="primary",
                    size="lg"
                )
            
            # Right column: status text plus the (initially hidden) download.
            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="πŸ“Š Status",
                    placeholder="Status updates will appear here...",
                    lines=10
                )
                
                download_file = gr.File(
                    label="⬇️ Download MCQs Excel File",
                    visible=False
                )
        
        # Event handlers
        # First step populates status + file value; the chained step then
        # toggles the download component's visibility based on that value.
        generate_btn.click(
            fn=process_pdf_to_mcqs,
            inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
            outputs=[status_output, download_file],
            show_progress=True
        ).then(
            fn=lambda file_path: gr.update(visible=bool(file_path)) if file_path else gr.update(visible=False),
            inputs=[download_file],
            outputs=[download_file]
        )
        
        gr.Markdown(
            """
            ## πŸ“‹ Features:
            - **OCR Text Extraction**: Converts PDF pages to images and extracts text
            - **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
            - **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
            - **Multiple AI Models**: Automatically tries different Gemini models for best results
            - **Excel Output**: Download MCQs in a formatted Excel file
            - **Progress Tracking**: Real-time updates on processing status
            
            ## ⚠️ Notes:
            - To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
            - Processing time depends on PDF length and settings (more MCQs = longer time)
            - Large PDFs are processed in chunks to avoid timeouts
            - Make sure your PDF contains readable text (not just images)
            - API key is not stored and only used for your session
            - Reduced delay between API calls for faster processing, but monitor for rate limits
            """
        )
    
    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    # 0.0.0.0 binds all interfaces so the app is reachable from outside a
    # container (e.g. Docker / Hugging Face Spaces) on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)