Ali-Raza-167 committed on
Commit
fc39378
·
verified ·
1 Parent(s): f81ae08

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +219 -0
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pdf2image import convert_from_path
4
+ import pytesseract
5
+ import google.generativeai as genai
6
+ import tempfile
7
+ import os
8
+ import re
9
+ import time
10
+ from io import BytesIO
11
+
12
def batch_statements(statements, batch_size=5):
    """Split a sequence of statements into consecutive batches.

    Args:
        statements: A sliceable sequence (e.g. a list of strings).
        batch_size: Maximum number of items per batch; must be >= 1.

    Returns:
        An iterator over slices of ``statements``, each holding at most
        ``batch_size`` items. The last batch may be shorter. An empty
        input produces no batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative size would produce an empty range and silently drop
            every statement.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    return (
        statements[start:start + batch_size]
        for start in range(0, len(statements), batch_size)
    )
16
+
17
def _resolve_pdf_path(pdf_file):
    """Return ``(path, is_temporary)`` for an uploaded PDF.

    Raw bytes are written to a temporary ``.pdf`` file; ``is_temporary`` is
    then True and the caller is responsible for deleting the file. A path
    (``str`` / ``os.PathLike``) is used in place. As a fallback, an object
    exposing a string ``name`` attribute is treated as a file wrapper and its
    ``name`` is used as the path.

    Raises:
        TypeError: If ``pdf_file`` is none of the supported kinds.
    """
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(pdf_file)
        return tmp_file.name, True
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file), False
    wrapped_name = getattr(pdf_file, "name", None)
    if isinstance(wrapped_name, str):
        return wrapped_name, False
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def _parse_mcq_rows(output):
    """Extract valid MCQ rows from the model's CSV-formatted reply.

    A row is kept only when it has exactly six fields and the last field is
    one of ``A``, ``B``, ``C`` or ``D``. Header lines, code fences and any
    other chatter therefore fall through and are ignored.
    """
    import csv  # local import keeps this block self-contained

    rows = []
    for line in output.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        try:
            # Parse one line at a time so an unbalanced quote cannot swallow
            # the lines that follow it. The csv module keeps commas inside
            # quoted fields intact, unlike a plain ``split(',')``.
            records = list(csv.reader([stripped], skipinitialspace=True))
        except csv.Error:
            continue
        for record in records:
            fields = [field.strip() for field in record]
            if len(fields) == 6 and fields[5] in ('A', 'B', 'C', 'D'):
                rows.append(fields)
    return rows


def extract_text_from_pdf(pdf_file, api_key, progress_callback=None):
    """Extract text from a PDF via OCR and generate MCQs with Gemini.

    Args:
        pdf_file: The uploaded PDF, as raw bytes, a filesystem path, or a
            file wrapper exposing ``name``.
        api_key: Google API key used to configure the Gemini client.
        progress_callback: Optional callable ``(fraction, description)``
            invoked as processing advances.

    Returns:
        A ``(excel_bytes, message)`` tuple. ``excel_bytes`` holds the
        generated ``.xlsx`` content on success and is ``None`` on failure,
        in which case ``message`` describes the problem. Failures are
        reported through the return value rather than raised.
    """
    if not api_key or not api_key.strip():
        return None, "Please enter your Google API key"
    if pdf_file is None:
        return None, "Please upload a PDF file"

    def report(value, desc):
        if progress_callback:
            progress_callback(value, desc)

    try:
        # Configure Gemini API (whitespace around a pasted key is dropped)
        genai.configure(api_key=api_key.strip())
        model = genai.GenerativeModel('gemini-1.5-flash')

        pdf_path, is_temporary = _resolve_pdf_path(pdf_file)
        try:
            report(0.1, "Converting PDF to images...")

            # Convert PDF to images and extract text
            pages = convert_from_path(pdf_path)
            page_texts = []
            for i, page in enumerate(pages):
                report(0.1 + (i / len(pages)) * 0.3, f"Processing page {i+1}/{len(pages)}...")
                page_texts.append(pytesseract.image_to_string(page))
        finally:
            # Remove our temp copy even when conversion or OCR fails; a
            # caller-supplied path is never deleted.
            if is_temporary:
                os.unlink(pdf_path)

        report(0.4, "Splitting text into statements...")

        # Split into statements
        all_statements = []
        for page_text in page_texts:
            all_statements.extend(
                s.strip() for s in re.split(r'[.!?]', page_text) if s.strip()
            )

        if not all_statements:
            return None, "No text could be extracted from the PDF"

        report(0.5, f"Found {len(all_statements)} statements. Creating batches...")

        batches = list(batch_statements(all_statements, batch_size=5))

        report(0.6, f"Generating MCQs from {len(batches)} batches...")

        # Generate MCQs
        mcq_data = []
        for i, batch in enumerate(batches):
            report(0.6 + (i / len(batches)) * 0.3, f"Processing batch {i+1}/{len(batches)}...")

            text_block = ". ".join(batch)
            prompt = (
                "\n"
                "Generate exactly 5 multiple choice questions from the following text.\n"
                "Each question must have:\n"
                "- A clear question\n"
                "- 4 options labeled A, B, C, D\n"
                "- One correct answer (only the letter A, B, C, or D)\n"
                "\n"
                "Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer\n"
                "\n"
                f"Text: {text_block}\n"
            )

            try:
                response = model.generate_content(prompt)
                mcq_data.extend(_parse_mcq_rows(response.text.strip()))
            except Exception as e:
                # Best effort: one failed batch must not abort the whole run.
                print(f"Error generating MCQ for batch {i+1}: {str(e)}")

            # Small delay to avoid rate limiting; applied after failures too,
            # since a failed request may itself be a rate-limit response.
            time.sleep(0.1)

        report(0.95, "Creating Excel file...")

        if not mcq_data:
            return None, "No MCQs could be generated from the text"

        # Create DataFrame and serialize it to an in-memory Excel file
        df = pd.DataFrame(
            mcq_data,
            columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'],
        )
        excel_buffer = BytesIO()
        df.to_excel(excel_buffer, index=False, engine='openpyxl')
        excel_buffer.seek(0)

        report(1.0, f"Complete! Generated {len(mcq_data)} MCQs")

        return excel_buffer.getvalue(), f"Successfully generated {len(mcq_data)} MCQs from {len(pages)} pages"

    except Exception as e:
        return None, f"Error processing PDF: {str(e)}"
127
+
128
def process_pdf_with_progress(pdf_file, api_key, progress=gr.Progress()):
    """Gradio event handler: run the PDF-to-MCQ pipeline with progress updates.

    Args:
        pdf_file: Value delivered by the PDF upload component.
        api_key: Google API key entered by the user.
        progress: Gradio progress tracker; the default instance is the
            placeholder Gradio replaces with a live tracker at call time.

    Returns:
        A ``(download_path, status_message)`` tuple for the file and status
        outputs. ``download_path`` is ``None`` when generation failed.
    """
    def progress_callback(value, desc):
        progress(value, desc=desc)

    result, status = extract_text_from_pdf(pdf_file, api_key, progress_callback)

    # Failures (None) pass through untouched.
    if not isinstance(result, (bytes, bytearray)):
        return result, status

    # A Gradio File output expects a path on disk, not raw bytes, so the
    # workbook is written under the system temp directory with a readable
    # file name for the download.
    output_dir = tempfile.mkdtemp(prefix="mcq_")
    output_path = os.path.join(output_dir, "generated_mcqs.xlsx")
    with open(output_path, "wb") as excel_file:
        excel_file.write(result)
    return output_path, status
134
+
135
+ # Create Gradio interface
136
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF to MCQ generator.

    The layout has an input column (API key, PDF upload, generate button) and
    an output column (status text, downloadable Excel file). The generate
    button is wired to ``process_pdf_with_progress``.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="PDF to MCQ Generator", theme=gr.themes.Soft()) as app:
        gr.Markdown(
            """
            # 📚 PDF to MCQ Generator

            Upload a PDF file and generate multiple choice questions automatically using Google's Gemini AI.

            **Instructions:**
            1. Get your free Google AI API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
            2. Enter your API key below
            3. Upload your PDF file
            4. Click "Generate MCQs" and wait for processing
            5. Download the generated Excel file with MCQs
            """
        )

        with gr.Row():
            with gr.Column():
                api_key_input = gr.Textbox(
                    label="Google AI API Key",
                    placeholder="Enter your Google AI API key here...",
                    type="password",
                    info="Get your free API key from Google AI Studio"
                )

                # type="binary" hands the handler the raw file bytes, which is
                # what the processing pipeline writes to its temp PDF. The
                # File component takes no `info` argument, so none is passed.
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="binary"
                )

                generate_btn = gr.Button(
                    "🚀 Generate MCQs",
                    variant="primary",
                    size="lg"
                )

            with gr.Column():
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    info="Processing status will appear here"
                )

                download_file = gr.File(
                    label="Download MCQs Excel File",
                    interactive=False
                )

        gr.Markdown(
            """
            ### Features:
            - 🤖 Powered by Google's Gemini AI
            - 📄 Extracts text from PDF using OCR
            - ❓ Generates 5 MCQs per text batch
            - 📊 Outputs organized Excel file
            - 🔄 Progress tracking during processing

            ### Tips for better results:
            - Use PDFs with clear, readable text
            - Ensure good image quality for OCR
            - Educational content works best for MCQ generation
            """
        )

        # Event handler
        generate_btn.click(
            fn=process_pdf_with_progress,
            inputs=[pdf_input, api_key_input],
            outputs=[download_file, status_output],
            show_progress=True
        )

    return app
211
+
212
+ # Launch the app
213
if __name__ == "__main__":
    # Bind on all interfaces at port 7860 and request a public share link.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    app = create_interface()
    app.launch(**launch_options)