PDF-Processor

Sleeping

App Files Community

167AliRaza committed on Sep 18, 2025

Commit

54b1661

verified ·

1 Parent(s): 41c3d7f

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -16

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ def extract_text_from_pdf(pdf_file_path: str) -> str:
     except Exception as e:
         return f"Error extracting text: {str(e)}"
-def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
     """Split text into chunks for processing"""
     words = text.split()
     chunks = []
@@ -39,7 +39,7 @@ def chunk_text(text: str, chunk_size: int = 1500) -> List[str]:
         chunks.append(' '.join(words[i:i+chunk_size]))
     return chunks
-def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) -> List[List[str]]:
     """Generate MCQs from a text chunk using Gemini API"""
     print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
     print(f"Chunk length: {len(chunk)} characters")
@@ -52,7 +52,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
     ]
     prompt = f"""
-    Generate exactly 10 multiple choice questions from the following text.
     Each question must have:
     - A clear, specific question
     - 4 options labeled A, B, C, D
@@ -152,7 +152,7 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1) ->
     print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
     return mcq_data
-def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
     """Main function to process PDF and generate MCQs"""
     if not api_key:
         return "❌ Please provide your Gemini API key", None
@@ -182,11 +182,11 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         for i, chunk in enumerate(chunks):
             progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
-            chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key)
             all_mcq_data.extend(chunk_mcqs)
-            # Add small delay to avoid rate limiting
-            time.sleep(1)
         progress(0.95, desc="Creating Excel file...")
@@ -215,7 +215,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         progress(1.0, desc="Complete!")
-        success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks!"
         return success_message, temp_file.name
@@ -235,9 +235,10 @@ def create_interface():
             1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
             2. Enter your API key below
             3. Upload your PDF file
-            4. Adjust chunk size if needed (larger = fewer API calls, smaller = more focused questions)
-            5. Click "Generate MCQs" and wait for processing
-            6. Download the generated Excel file with your MCQs
             """
         )
@@ -255,13 +256,21 @@ def create_interface():
                 )
                 chunk_size_input = gr.Slider(
-                    minimum=500,
                     maximum=3000,
-                    value=1500,
                     step=100,
                     label="📝 Chunk Size (words per processing batch)"
                 )
                 generate_btn = gr.Button(
                     "🚀 Generate MCQs",
                     variant="primary",
@@ -283,7 +292,7 @@ def create_interface():
         # Event handlers
         generate_btn.click(
             fn=process_pdf_to_mcqs,
-            inputs=[pdf_input, api_key_input, chunk_size_input],
             outputs=[status_output, download_file],
             show_progress=True
         ).then(
@@ -296,16 +305,19 @@ def create_interface():
             """
             ## 📋 Features:
             - **OCR Text Extraction**: Converts PDF pages to images and extracts text
-            - **Smart Chunking**: Breaks large documents into manageable pieces
             - **Multiple AI Models**: Automatically tries different Gemini models for best results
             - **Excel Output**: Download MCQs in a formatted Excel file
             - **Progress Tracking**: Real-time updates on processing status
             ## ⚠️ Notes:
-            - Processing time depends on PDF length and complexity
             - Large PDFs are processed in chunks to avoid timeouts
             - Make sure your PDF contains readable text (not just images)
             - API key is not stored and only used for your session
             """
         )

     except Exception as e:
         return f"Error extracting text: {str(e)}"
+def chunk_text(text: str, chunk_size: int = 500) -> List[str]:  # Changed default to 500 for more chunks/MCQs
     """Split text into chunks for processing"""
     words = text.split()
     chunks = []
         chunks.append(' '.join(words[i:i+chunk_size]))
     return chunks
+def generate_mcqs_from_chunk(chunk: str, api_key: str, chunk_number: int = 1, mcqs_per_chunk: int = 20) -> List[List[str]]:  # Added mcqs_per_chunk param, default 20
     """Generate MCQs from a text chunk using Gemini API"""
     print(f"\n=== PROCESSING CHUNK {chunk_number} ===")
     print(f"Chunk length: {len(chunk)} characters")
     ]
     prompt = f"""
+    Generate exactly {mcqs_per_chunk} multiple choice questions from the following text.
     Each question must have:
     - A clear, specific question
     - 4 options labeled A, B, C, D
     print(f"Generated {len(mcq_data)} MCQs from chunk {chunk_number}")
     return mcq_data
+def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 500, mcqs_per_chunk: int = 20, progress=gr.Progress()) -> Tuple[str, str]:  # Added mcqs_per_chunk param, default 20
     """Main function to process PDF and generate MCQs"""
     if not api_key:
         return "❌ Please provide your Gemini API key", None
         for i, chunk in enumerate(chunks):
             progress((0.2 + (i / total_chunks) * 0.7), desc=f"Processing chunk {i+1}/{total_chunks}...")
+            chunk_mcqs = generate_mcqs_from_chunk(chunk, api_key, i+1, mcqs_per_chunk)
             all_mcq_data.extend(chunk_mcqs)
+            # Reduced delay to 0.5s for faster processing (to maximize MCQs, but monitor rate limits)
+            time.sleep(0.5)
         progress(0.95, desc="Creating Excel file...")
         progress(1.0, desc="Complete!")
+        success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks ({mcqs_per_chunk} targeted per chunk)!"
         return success_message, temp_file.name
             1. Get your Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
             2. Enter your API key below
             3. Upload your PDF file
+            4. Adjust chunk size if needed (smaller = more chunks/MCQs, but slower; default 500 for max MCQs)
+            5. Adjust MCQs per chunk (higher = more MCQs per chunk, but may hit API limits; default 20 for max)
+            6. Click "Generate MCQs" and wait for processing
+            7. Download the generated Excel file with your MCQs
             """
         )
                 )
                 chunk_size_input = gr.Slider(
+                    minimum=300,  # Lowered min to allow even smaller chunks
                     maximum=3000,
+                    value=500,  # Changed default to 500 for more chunks
                     step=100,
                     label="📝 Chunk Size (words per processing batch)"
                 )
+                mcqs_per_chunk_input = gr.Slider(
+                    minimum=5,
+                    maximum=50,  # Increased max for more MCQs per chunk
+                    value=20,  # New slider for MCQs per chunk, default 20
+                    step=5,
+                    label="🔢 MCQs per Chunk (higher = more MCQs, but may increase failures)"
+                )
                 generate_btn = gr.Button(
                     "🚀 Generate MCQs",
                     variant="primary",
         # Event handlers
         generate_btn.click(
             fn=process_pdf_to_mcqs,
+            inputs=[pdf_input, api_key_input, chunk_size_input, mcqs_per_chunk_input],
             outputs=[status_output, download_file],
             show_progress=True
         ).then(
             """
             ## 📋 Features:
             - **OCR Text Extraction**: Converts PDF pages to images and extracts text
+            - **Smart Chunking**: Breaks large documents into manageable pieces (smaller chunks = more MCQs)
+            - **Configurable MCQs per Chunk**: Now adjustable up to 50 for maximum generation
             - **Multiple AI Models**: Automatically tries different Gemini models for best results
             - **Excel Output**: Download MCQs in a formatted Excel file
             - **Progress Tracking**: Real-time updates on processing status
             ## ⚠️ Notes:
+            - To maximize MCQs: Use small chunk size (e.g., 300-500) and high MCQs per chunk (e.g., 20-50)
+            - Processing time depends on PDF length and settings (more MCQs = longer time)
             - Large PDFs are processed in chunks to avoid timeouts
             - Make sure your PDF contains readable text (not just images)
             - API key is not stored and only used for your session
+            - Reduced delay between API calls for faster processing, but monitor for rate limits
             """
         )