Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,7 +31,7 @@ def extract_text_from_pdf(pdf_file_path: str) -> str:
|
|
| 31 |
except Exception as e:
|
| 32 |
return f"Error extracting text: {str(e)}"
|
| 33 |
|
| 34 |
-
def chunk_text(text: str, chunk_size: int =
|
| 35 |
"""Split text into chunks for processing"""
|
| 36 |
words = text.split()
|
| 37 |
chunks = []
|
|
@@ -39,7 +39,7 @@ def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
|
|
| 39 |
chunks.append(' '.join(words[i:i+chunk_size]))
|
| 40 |
return chunks
|
| 41 |
|
| 42 |
-
def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) -> List[List[str]]:
|
| 43 |
"""Generate MCQs from a text chunk using Gemini API"""
|
| 44 |
print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
|
| 45 |
print(f"Chunk length: {len(chunk)} characters")
|
|
@@ -52,7 +52,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
|
|
| 52 |
]
|
| 53 |
|
| 54 |
prompt = f"""
|
| 55 |
-
Generate exactly
|
| 56 |
Each question must have:
|
| 57 |
- A clear, specific question
|
| 58 |
- 4 options labeled A, B, C, D
|
|
@@ -152,7 +152,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
|
|
| 152 |
print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
|
| 153 |
return mcq_data
|
| 154 |
|
| 155 |
-
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int =
|
| 156 |
"""Main function to process PDF and generate MCQs"""
|
| 157 |
if not api_key:
|
| 158 |
return "❌ Please provide your Gemini API key", None
|
|
@@ -182,11 +182,11 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
|
|
| 182 |
for i, chunk in enumerate(chunks):
|
| 183 |
progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
|
| 184 |
|
| 185 |
-
chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key)
|
| 186 |
all_mcq_data.extend(chunk_mcqs)
|
| 187 |
|
| 188 |
-
#
|
| 189 |
-
time.sleep(
|
| 190 |
|
| 191 |
progress(0.95, desc="Creating Excel file...")
|
| 192 |
|
|
@@ -215,7 +215,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
|
|
| 215 |
|
| 216 |
progress(1.0, desc="Complete!")
|
| 217 |
|
| 218 |
-
success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks!"
|
| 219 |
|
| 220 |
return success_message, temp_file.name
|
| 221 |
|
|
@@ -235,9 +235,10 @@ def create_interface():
|
|
| 235 |
1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
|
| 236 |
2. Enter your API key below
|
| 237 |
3. Upload your PDF file
|
| 238 |
-
4. Adjust chunk size if needed (
|
| 239 |
-
5.
|
| 240 |
-
6.
|
|
|
|
| 241 |
"""
|
| 242 |
)
|
| 243 |
|
|
@@ -255,13 +256,21 @@ def create_interface():
|
|
| 255 |
)
|
| 256 |
|
| 257 |
chunk_size_input = gr.Slider(
|
| 258 |
-
minimum=
|
| 259 |
maximum=3000,
|
| 260 |
-
value=
|
| 261 |
step=100,
|
| 262 |
label="π Chunk Size (words per processing batch)"
|
| 263 |
)
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
generate_btn = gr.Button(
|
| 266 |
"π Generate MCQs",
|
| 267 |
variant="primary",
|
|
@@ -283,7 +292,7 @@ def create_interface():
|
|
| 283 |
# Event handlers
|
| 284 |
generate_btn.click(
|
| 285 |
fn=process_pdf_to_mcqs,
|
| 286 |
-
inputs=[pdf_input, api_key_input, chunk_size_input],
|
| 287 |
outputs=[status_output, download_file],
|
| 288 |
show_progress=True
|
| 289 |
).then(
|
|
@@ -296,16 +305,19 @@ def create_interface():
|
|
| 296 |
"""
|
| 297 |
## π Features:
|
| 298 |
- **OCR Text Extraction**: Converts PDF pages to images and extracts text
|
| 299 |
-
- **Smart Chunking**: Breaks large documents into manageable pieces
|
|
|
|
| 300 |
- **Multiple AI Models**: Automatically tries different Gemini models for best results
|
| 301 |
- **Excel Output**: Download MCQs in a formatted Excel file
|
| 302 |
- **Progress Tracking**: Real-time updates on processing status
|
| 303 |
|
| 304 |
## ⚠️ Notes:
|
| 305 |
-
-
|
|
|
|
| 306 |
- Large PDFs are processed in chunks to avoid timeouts
|
| 307 |
- Make sure your PDF contains readable text (not just images)
|
| 308 |
- API key is not stored and only used for your session
|
|
|
|
| 309 |
"""
|
| 310 |
)
|
| 311 |
|
|
|
|
| 31 |
except Exception as e:
|
| 32 |
return f"Error extracting text: {str(e)}"
|
| 33 |
|
| 34 |
+
def chunk_text(text: str, chunk_size: int = 500) -> List[str]: # Changed default to 500 for more chunks/MCQs
|
| 35 |
"""Split text into chunks for processing"""
|
| 36 |
words = text.split()
|
| 37 |
chunks = []
|
|
|
|
| 39 |
chunks.append(' '.join(words[i:i+chunk_size]))
|
| 40 |
return chunks
|
| 41 |
|
| 42 |
+
def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]: # Added mcqs_per_chunk param, default 20
|
| 43 |
"""Generate MCQs from a text chunk using Gemini API"""
|
| 44 |
print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
|
| 45 |
print(f"Chunk length: {len(chunk)} characters")
|
|
|
|
| 52 |
]
|
| 53 |
|
| 54 |
prompt = f"""
|
| 55 |
+
Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
|
| 56 |
Each question must have:
|
| 57 |
- A clear, specific question
|
| 58 |
- 4 options labeled A, B, C, D
|
|
|
|
| 152 |
print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
|
| 153 |
return mcq_data
|
| 154 |
|
| 155 |
+
def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]: # Added mcqs_per_chunk param, default 20
|
| 156 |
"""Main function to process PDF and generate MCQs"""
|
| 157 |
if not api_key:
|
| 158 |
return "❌ Please provide your Gemini API key", None
|
|
|
|
| 182 |
for i, chunk in enumerate(chunks):
|
| 183 |
progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
|
| 184 |
|
| 185 |
+
chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i+1, mcqs_per_chunk)
|
| 186 |
all_mcq_data.extend(chunk_mcqs)
|
| 187 |
|
| 188 |
+
# Reduced delay to 0.5s for faster processing (to maximize MCQs, but monitor rate limits)
|
| 189 |
+
time.sleep(0.5)
|
| 190 |
|
| 191 |
progress(0.95, desc="Creating Excel file...")
|
| 192 |
|
|
|
|
| 215 |
|
| 216 |
progress(1.0, desc="Complete!")
|
| 217 |
|
| 218 |
+
success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
|
| 219 |
|
| 220 |
return success_message, temp_file.name
|
| 221 |
|
|
|
|
| 235 |
1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
|
| 236 |
2. Enter your API key below
|
| 237 |
3. Upload your PDF file
|
| 238 |
+
4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
|
| 239 |
+
5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
|
| 240 |
+
6. Click "Generate MCQs" and wait for processing
|
| 241 |
+
7. Download the generated Excel file with your MCQs
|
| 242 |
"""
|
| 243 |
)
|
| 244 |
|
|
|
|
| 256 |
)
|
| 257 |
|
| 258 |
chunk_size_input = gr.Slider(
|
| 259 |
+
minimum=300, # Lowered min to allow even smaller chunks
|
| 260 |
maximum=3000,
|
| 261 |
+
value=500, # Changed default to 500 for more chunks
|
| 262 |
step=100,
|
| 263 |
label="π Chunk Size (words per processing batch)"
|
| 264 |
)
|
| 265 |
|
| 266 |
+
mcqs_per_chunk_input = gr.Slider(
|
| 267 |
+
minimum=5,
|
| 268 |
+
maximum=50, # Increased max for more MCQs per chunk
|
| 269 |
+
value=20, # New slider for MCQs per chunk, default 20
|
| 270 |
+
step=5,
|
| 271 |
+
label="🔢 MCQs per Chunk (higher = more MCQs, but may increase failures)"
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
generate_btn = gr.Button(
|
| 275 |
"π Generate MCQs",
|
| 276 |
variant="primary",
|
|
|
|
| 292 |
# Event handlers
|
| 293 |
generate_btn.click(
|
| 294 |
fn=process_pdf_to_mcqs,
|
| 295 |
+
inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
|
| 296 |
outputs=[status_output, download_file],
|
| 297 |
show_progress=True
|
| 298 |
).then(
|
|
|
|
| 305 |
"""
|
| 306 |
## π Features:
|
| 307 |
- **OCR Text Extraction**: Converts PDF pages to images and extracts text
|
| 308 |
+
- **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
|
| 309 |
+
- **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
|
| 310 |
- **Multiple AI Models**: Automatically tries different Gemini models for best results
|
| 311 |
- **Excel Output**: Download MCQs in a formatted Excel file
|
| 312 |
- **Progress Tracking**: Real-time updates on processing status
|
| 313 |
|
| 314 |
## ⚠️ Notes:
|
| 315 |
+
- To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
|
| 316 |
+
- Processing time depends on PDF length and settings (more MCQs = longer time)
|
| 317 |
- Large PDFs are processed in chunks to avoid timeouts
|
| 318 |
- Make sure your PDF contains readable text (not just images)
|
| 319 |
- API key is not stored and only used for your session
|
| 320 |
+
- Reduced delay between API calls for faster processing, but monitor for rate limits
|
| 321 |
"""
|
| 322 |
)
|
| 323 |
|