167AliRaza committed on
Commit
54b1661
·
verified ·
1 Parent(s): 41c3d7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -16
app.py CHANGED
@@ -31,7 +31,7 @@ def extract_text_from_pdf(pdf_file_path: str) -> str:
31
  except Exception as e:
32
  return f"Error extracting text: {str(e)}"
33
 
34
- def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
35
  """Split text into chunks for processing"""
36
  words = text.split()
37
  chunks = []
@@ -39,7 +39,7 @@ def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
39
  chunks.append(' '.join(words[i:i+chunk_size]))
40
  return chunks
41
 
42
- def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) -> List[List[str]]:
43
  """Generate MCQs from a text chunk using Gemini API"""
44
  print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
45
  print(f"Chunk length: {len(chunk)} characters")
@@ -52,7 +52,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
52
  ]
53
 
54
  prompt = f"""
55
- Generate exactly 10 multiple choice questions from the following text.
56
  Each question must have:
57
  - A clear, specific question
58
  - 4 options labeled A, B, C, D
@@ -152,7 +152,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
152
  print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
153
  return mcq_data
154
 
155
- def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
156
  """Main function to process PDF and generate MCQs"""
157
  if not api_key:
158
  return "❌ Please provide your Gemini API key", None
@@ -182,11 +182,11 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
182
  for i, chunk in enumerate(chunks):
183
  progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
184
 
185
- chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key)
186
  all_mcq_data.extend(chunk_mcqs)
187
 
188
- # Add small delay to avoid rate limiting
189
- time.sleep(1)
190
 
191
  progress(0.95, desc="Creating Excel file...")
192
 
@@ -215,7 +215,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
215
 
216
  progress(1.0, desc="Complete!")
217
 
218
- success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks!"
219
 
220
  return success_message, temp_file.name
221
 
@@ -235,9 +235,10 @@ def create_interface():
235
  1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
236
  2. Enter your API key below
237
  3. Upload your PDF file
238
- 4. Adjust chunk size if needed (larger = fewer API calls, smaller = more focused questions)
239
- 5. Click "Generate MCQs" and wait for processing
240
- 6. Download the generated Excel file with your MCQs
 
241
  """
242
  )
243
 
@@ -255,13 +256,21 @@ def create_interface():
255
  )
256
 
257
  chunk_size_input = gr.Slider(
258
- minimum=500,
259
  maximum=3000,
260
- value=1500,
261
  step=100,
262
  label="📏 Chunk Size (words per processing batch)"
263
  )
264
 
 
 
 
 
 
 
 
 
265
  generate_btn = gr.Button(
266
  "🚀 Generate MCQs",
267
  variant="primary",
@@ -283,7 +292,7 @@ def create_interface():
283
  # Event handlers
284
  generate_btn.click(
285
  fn=process_pdf_to_mcqs,
286
- inputs=[pdf_input, api_key_input, chunk_size_input],
287
  outputs=[status_output, download_file],
288
  show_progress=True
289
  ).then(
@@ -296,16 +305,19 @@ def create_interface():
296
  """
297
  ## 📋 Features:
298
  - **OCR Text Extraction**: Converts PDF pages to images and extracts text
299
- - **Smart Chunking**: Breaks large documents into manageable pieces
 
300
  - **Multiple AI Models**: Automatically tries different Gemini models for best results
301
  - **Excel Output**: Download MCQs in a formatted Excel file
302
  - **Progress Tracking**: Real-time updates on processing status
303
 
304
  ## ⚠️ Notes:
305
- - Processing time depends on PDF length and complexity
 
306
  - Large PDFs are processed in chunks to avoid timeouts
307
  - Make sure your PDF contains readable text (not just images)
308
  - API key is not stored and only used for your session
 
309
  """
310
  )
311
 
 
31
  except Exception as e:
32
  return f"Error extracting text: {str(e)}"
33
 
34
+ def chunk_text(text: str, chunk_size: int = 500) -> List[str]: # Changed default to 500 for more chunks/MCQs
35
  """Split text into chunks for processing"""
36
  words = text.split()
37
  chunks = []
 
39
  chunks.append(' '.join(words[i:i+chunk_size]))
40
  return chunks
41
 
42
+ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]: # Added mcqs_per_chunk param, default 20
43
  """Generate MCQs from a text chunk using Gemini API"""
44
  print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
45
  print(f"Chunk length: {len(chunk)} characters")
 
52
  ]
53
 
54
  prompt = f"""
55
+ Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
56
  Each question must have:
57
  - A clear, specific question
58
  - 4 options labeled A, B, C, D
 
152
  print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
153
  return mcq_data
154
 
155
+ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]: # Added mcqs_per_chunk param, default 20
156
  """Main function to process PDF and generate MCQs"""
157
  if not api_key:
158
  return "❌ Please provide your Gemini API key", None
 
182
  for i, chunk in enumerate(chunks):
183
  progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
184
 
185
+ chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i+1, mcqs_per_chunk)
186
  all_mcq_data.extend(chunk_mcqs)
187
 
188
+ # Reduced delay to 0.5s for faster processing (to maximize MCQs, but monitor rate limits)
189
+ time.sleep(0.5)
190
 
191
  progress(0.95, desc="Creating Excel file...")
192
 
 
215
 
216
  progress(1.0, desc="Complete!")
217
 
218
+ success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
219
 
220
  return success_message, temp_file.name
221
 
 
235
  1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
236
  2. Enter your API key below
237
  3. Upload your PDF file
238
+ 4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
239
+ 5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
240
+ 6. Click "Generate MCQs" and wait for processing
241
+ 7. Download the generated Excel file with your MCQs
242
  """
243
  )
244
 
 
256
  )
257
 
258
  chunk_size_input = gr.Slider(
259
+ minimum=300, # Lowered min to allow even smaller chunks
260
  maximum=3000,
261
+ value=500, # Changed default to 500 for more chunks
262
  step=100,
263
  label="📏 Chunk Size (words per processing batch)"
264
  )
265
 
266
+ mcqs_per_chunk_input = gr.Slider(
267
+ minimum=5,
268
+ maximum=50, # Increased max for more MCQs per chunk
269
+ value=20, # New slider for MCQs per chunk, default 20
270
+ step=5,
271
+ label="🔢 MCQs per Chunk (higher = more MCQs, but may increase failures)"
272
+ )
273
+
274
  generate_btn = gr.Button(
275
  "🚀 Generate MCQs",
276
  variant="primary",
 
292
  # Event handlers
293
  generate_btn.click(
294
  fn=process_pdf_to_mcqs,
295
+ inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
296
  outputs=[status_output, download_file],
297
  show_progress=True
298
  ).then(
 
305
  """
306
  ## 📋 Features:
307
  - **OCR Text Extraction**: Converts PDF pages to images and extracts text
308
+ - **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
309
+ - **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
310
  - **Multiple AI Models**: Automatically tries different Gemini models for best results
311
  - **Excel Output**: Download MCQs in a formatted Excel file
312
  - **Progress Tracking**: Real-time updates on processing status
313
 
314
  ## ⚠️ Notes:
315
+ - To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
316
+ - Processing time depends on PDF length and settings (more MCQs = longer time)
317
  - Large PDFs are processed in chunks to avoid timeouts
318
  - Make sure your PDF contains readable text (not just images)
319
  - API key is not stored and only used for your session
320
+ - Reduced delay between API calls for faster processing, but monitor for rate limits
321
  """
322
  )
323