PDF-Processor

Sleeping

App Files Community

167AliRaza committed on Sep 18, 2025

Commit

65956db

verified ·

1 Parent(s): 95fb4d2

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -28

app.py CHANGED Viewed

@@ -15,25 +15,17 @@ def configure_gemini_api(api_key: str):
     genai.configure(api_key=api_key)
     return "✅ API Key configured successfully!"
-def extract_text_from_pdf(pdf_file) -> str:
     """Extract text from PDF using OCR"""
     try:
-        # Create temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-            tmp_file.write(pdf_file)
-            tmp_path = tmp_file.name
         # Convert PDF to images
-        pages = convert_from_path(tmp_path)
         all_text = ""
         for i, page in enumerate(pages):
             text = pytesseract.image_to_string(page)
             all_text += text + "\n"
-        # Clean up temporary file
-        os.unlink(tmp_path)
         return all_text
     except Exception as e:
         return f"Error extracting text: {str(e)}"
@@ -124,25 +116,25 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
 def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
     """Main function to process PDF and generate MCQs"""
     if not api_key:
-        return "❌ Please provide your Gemini API key", ""
     if not pdf_file:
-        return "❌ Please upload a PDF file", ""
     try:
         # Extract text from PDF
         progress(0.1, desc="Extracting text from PDF...")
-        extracted_text = extract_text_from_pdf(pdf_file)
         if extracted_text.startswith("Error"):
-            return extracted_text, ""
         # Chunk the text
         progress(0.2, desc="Chunking text...")
         chunks = chunk_text(extracted_text, chunk_size)
         if not chunks:
-            return "❌ No text could be extracted from the PDF", ""
         # Generate MCQs from each chunk
         all_mcq_data = []
@@ -160,22 +152,18 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         progress(0.95, desc="Creating Excel file...")
         if not all_mcq_data:
-            return "❌ No MCQs could be generated from the PDF content", ""
         # Create DataFrame
         df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
-        # Create Excel file in memory
-        output = io.BytesIO()
-        with pd.ExcelWriter(output, engine='openpyxl') as writer:
-            df.to_excel(writer, index=False, sheet_name='MCQs')
-        output.seek(0)
-        # Save to temporary file for download
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
-        temp_file.write(output.getvalue())
-        temp_file.close()
         progress(1.0, desc="Complete!")
@@ -184,7 +172,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         return success_message, temp_file.name
     except Exception as e:
-        return f"❌ Error processing PDF: {str(e)}", ""
 # Create Gradio interface
 def create_interface():
@@ -251,7 +239,7 @@ def create_interface():
             outputs=[status_output, download_file],
             show_progress=True
         ).then(
-            fn=lambda x: gr.update(visible=bool(x)),
             inputs=[download_file],
             outputs=[download_file]
         )

     genai.configure(api_key=api_key)
     return "✅ API Key configured successfully!"
+def extract_text_from_pdf(pdf_file_path: str) -> str:
     """Extract text from PDF using OCR"""
     try:
         # Convert PDF to images
+        pages = convert_from_path(pdf_file_path)
         all_text = ""
         for i, page in enumerate(pages):
             text = pytesseract.image_to_string(page)
             all_text += text + "\n"
         return all_text
     except Exception as e:
         return f"Error extracting text: {str(e)}"
 def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
     """Main function to process PDF and generate MCQs"""
     if not api_key:
+        return "❌ Please provide your Gemini API key", None
     if not pdf_file:
+        return "❌ Please upload a PDF file", None
     try:
         # Extract text from PDF
         progress(0.1, desc="Extracting text from PDF...")
+        extracted_text = extract_text_from_pdf(pdf_file.name)
         if extracted_text.startswith("Error"):
+            return extracted_text, None
         # Chunk the text
         progress(0.2, desc="Chunking text...")
         chunks = chunk_text(extracted_text, chunk_size)
         if not chunks:
+            return "❌ No text could be extracted from the PDF", None
         # Generate MCQs from each chunk
         all_mcq_data = []
         progress(0.95, desc="Creating Excel file...")
         if not all_mcq_data:
+            return "❌ No MCQs could be generated from the PDF content", None
         # Create DataFrame
         df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
+        # Create temporary Excel file for download
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
+        temp_file.close()  # Close to allow pandas to write to it
+        # Write Excel file
+        with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
+            df.to_excel(writer, index=False, sheet_name='MCQs')
         progress(1.0, desc="Complete!")
         return success_message, temp_file.name
     except Exception as e:
+        return f"❌ Error processing PDF: {str(e)}", None
 # Create Gradio interface
 def create_interface():
             outputs=[status_output, download_file],
             show_progress=True
         ).then(
+            fn=lambda file_path: gr.update(visible=bool(file_path)) if file_path else gr.update(visible=False),
             inputs=[download_file],
             outputs=[download_file]
         )