PDF-Processor

Sleeping

App Files Community

167AliRaza committed on Sep 18, 2025

Commit

d20c5e7

verified ·

1 Parent(s): bce1ee4

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -21

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from pdf2image import convert_from_path
 import pytesseract
 import google.generativeai as genai
 import tempfile
 # Function: Extract text from PDF using OCR
 def extract_text_from_pdf(pdf_file):
@@ -18,7 +20,7 @@ def extract_text_from_pdf(pdf_file):
 def chunk_text(text, chunk_size=1500):
     words = text.split()
     for i in range(0, len(words), chunk_size):
-        yield ' '.join(words[i:i+chunk_size])
 # Models to try (fallbacks)
 models_to_try = [
@@ -28,7 +30,7 @@ models_to_try = [
     "gemini-2.0-flash-lite",
     "gemini-2.0-flash",
     "gemini-1.5-flash",
-    "gemini-1.5-pro"
 ]
 # Function: Generate MCQs
@@ -39,14 +41,18 @@ def generate_mcqs(text, api_key):
     for i, chunk in enumerate(chunks, start=1):
         prompt = f"""
-        Generate 10 MCQs from the following text.
-        Each question must have:
-        - Question
-        - 4 Options (A-D)
-        - Correct Answer
-        Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer.
-        Text:\n{chunk}
-        """
         response = None
         for model_name in models_to_try:
@@ -60,17 +66,22 @@ def generate_mcqs(text, api_key):
         if response and response.text:
             output = response.text.strip()
-            for line in output.splitlines():
-                parts = line.split(",")
-                if len(parts) >= 6 and parts[0]:
-                    mcq_data.append(parts)
-    filtered_mcq_data = [row for row in mcq_data if len(row) == 6]
-    if not filtered_mcq_data:
         return None, None
-    df = pd.DataFrame(filtered_mcq_data, columns=["Question", "OptionA", "OptionB", "OptionC", "OptionD", "CorrectAnswer"])
-    return df, df.head(10).to_string(index=False)  # Preview as plain table
 # Gradio pipeline
 def process_pdf(pdf_file, api_key):
@@ -95,19 +106,21 @@ def process_pdf(pdf_file, api_key):
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## 📘 PDF to MCQ Generator (Gemini AI)")
-    gr.Markdown("Upload a PDF, enter your Gemini API key, extract text with OCR, and generate MCQs saved as Excel.")
     api_key = gr.Textbox(label="Enter your Gemini API Key", type="password")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     generate_btn = gr.Button("Generate MCQs")
     preview_output = gr.Textbox(label="Preview (First 10 MCQs)", lines=15)
     excel_output = gr.File(label="Download Excel (.xlsx)")
     generate_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, api_key],
-        outputs=[preview_output, excel_output]
     )
 # Run app

 import pytesseract
 import google.generativeai as genai
 import tempfile
+import csv
+from io import StringIO
 # Function: Extract text from PDF using OCR
 def extract_text_from_pdf(pdf_file):
 def chunk_text(text, chunk_size=1500):
     words = text.split()
     for i in range(0, len(words), chunk_size):
+        yield " ".join(words[i:i+chunk_size])
 # Models to try (fallbacks)
 models_to_try = [
     "gemini-2.0-flash-lite",
     "gemini-2.0-flash",
     "gemini-1.5-flash",
+    "gemini-1.5-pro",
 ]
 # Function: Generate MCQs
     for i, chunk in enumerate(chunks, start=1):
         prompt = f"""
+Generate 10 MCQs from the following text.
+Return ONLY valid CSV rows with exactly 6 columns:
+Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
+Rules:
+- Do NOT add numbering, quotes, or explanations.
+- Do NOT add headers.
+- Do NOT add extra commas inside cells.
+- Exactly 10 rows per chunk.
+Text:\n{chunk}
+"""
         response = None
         for model_name in models_to_try:
         if response and response.text:
             output = response.text.strip()
+            try:
+                reader = csv.reader(StringIO(output))
+                for row in reader:
+                    if len(row) >= 6 and row[0]:
+                        mcq_data.append(row[:6])  # keep only first 6 cols
+            except Exception:
+                continue
+    if not mcq_data:
         return None, None
+    df = pd.DataFrame(
+        mcq_data,
+        columns=["Question", "OptionA", "OptionB", "OptionC", "OptionD", "CorrectAnswer"],
+    )
+    return df, df.head(10).to_string(index=False)
 # Gradio pipeline
 def process_pdf(pdf_file, api_key):
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## 📘 PDF to MCQ Generator (Gemini AI)")
+    gr.Markdown(
+        "Upload a PDF, enter your Gemini API key, extract text with OCR, and generate MCQs saved as Excel."
+    )
     api_key = gr.Textbox(label="Enter your Gemini API Key", type="password")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     generate_btn = gr.Button("Generate MCQs")
     preview_output = gr.Textbox(label="Preview (First 10 MCQs)", lines=15)
     excel_output = gr.File(label="Download Excel (.xlsx)")
     generate_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, api_key],
+        outputs=[preview_output, excel_output],
     )
 # Run app