PDF-Processor

Sleeping

App Files Files Community

167AliRaza committed on Sep 18, 2025

Commit

238717d

verified ·

1 Parent(s): 65956db

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -31

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import gradio as gr
 import google.generativeai as genai
 from typing import List, Tuple
 import time
 # Configure Gemini API
 def configure_gemini_api(api_key: str):
@@ -47,20 +48,23 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
     ]
     prompt = f"""
-    Generate 10 multiple choice questions from the following text.
     Each question must have:
     - A clear, specific question
     - 4 options labeled A, B, C, D
     - One correct answer (A, B, C, or D)
-    Format your response as CSV with headers: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
-    Important formatting rules:
     - Use commas only as field separators
     - If any field contains a comma, wrap it in double quotes
-    - Each row should be on a new line
-    - Make questions specific and clear
-    - Ensure options are distinct and plausible
     Text to analyze:
     {chunk}
@@ -85,32 +89,41 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
     if response and response.text:
         output = response.text.strip()
-        lines = output.splitlines()
-        # Skip header if present
-        for line in lines[1:] if lines and 'Question' in lines[0] else lines:
-            if line.strip():
-                # Simple CSV parsing (you might want to use csv module for better handling)
-                parts = []
-                current_part = ""
-                in_quotes = False
-                for char in line:
-                    if char == '"':
-                        in_quotes = not in_quotes
-                    elif char == ',' and not in_quotes:
-                        parts.append(current_part.strip().strip('"'))
-                        current_part = ""
-                    else:
-                        current_part += char
-                # Add the last part
-                if current_part:
-                    parts.append(current_part.strip().strip('"'))
-                if len(parts) >= 6 and parts[0].strip():
-                    mcq_data.append(parts[:6])
     return mcq_data
 def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
@@ -154,8 +167,17 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         if not all_mcq_data:
             return "❌ No MCQs could be generated from the PDF content", None
         # Create DataFrame
-        df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
         # Create temporary Excel file for download
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
@@ -167,7 +189,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
         progress(1.0, desc="Complete!")
-        success_message = f"✅ Successfully generated {len(all_mcq_data)} MCQs from {total_chunks} text chunks!"
         return success_message, temp_file.name

 import google.generativeai as genai
 from typing import List, Tuple
 import time
+import csv
 # Configure Gemini API
 def configure_gemini_api(api_key: str):
     ]
     prompt = f"""
+    Generate exactly 10 multiple choice questions from the following text.
     Each question must have:
     - A clear, specific question
     - 4 options labeled A, B, C, D
     - One correct answer (A, B, C, or D)
+    IMPORTANT: Do NOT include any headers or column names in your response.
+    Format each question as: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
+    Rules:
+    - Start directly with the first question, no headers
     - Use commas only as field separators
     - If any field contains a comma, wrap it in double quotes
+    - Each question should be on a new line
+    - Make questions specific and clear based on the text content
+    - Ensure all 4 options are plausible but only one is correct
+    - The correct answer should be A, B, C, or D only
     Text to analyze:
     {chunk}
     if response and response.text:
         output = response.text.strip()
+        lines = [line.strip() for line in output.splitlines() if line.strip()]
+        for line in lines:
+            # Skip any header lines that might still appear
+            if ('Question' in line and 'OptionA' in line and 'OptionB' in line) or line.startswith('Question,'):
+                continue
+            # Skip empty lines or lines that don't look like MCQs
+            if not line or line.count(',') < 5:
+                continue
+            # Parse CSV line using proper CSV parsing
+            import csv
+            try:
+                # Parse the single line with csv.reader so quoted fields containing commas stay intact
+                csv_reader = csv.reader([line])
+                parts = next(csv_reader)
+                # Ensure we have at least 6 parts (extras are dropped below) and the question is not empty
+                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
+                    # Clean up each part
+                    cleaned_parts = [part.strip() for part in parts[:6]]
+                    # Validate that correct answer is A, B, C, or D
+                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
+                        mcq_data.append(cleaned_parts)
+            except csv.Error:
+                # Fallback to simple split if CSV parsing fails
+                parts = line.split(',')
+                if len(parts) >= 6 and parts[0].strip() and not parts[0].lower().startswith('question'):
+                    cleaned_parts = [part.strip().strip('"') for part in parts[:6]]
+                    if cleaned_parts[5].upper() in ['A', 'B', 'C', 'D']:
+                        mcq_data.append(cleaned_parts)
+    print(f"Generated {len(mcq_data)} MCQs from chunk")
     return mcq_data
 def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
         if not all_mcq_data:
             return "❌ No MCQs could be generated from the PDF content", None
+        # Remove any duplicate questions
+        seen_questions = set()
+        unique_mcq_data = []
+        for mcq in all_mcq_data:
+            question_text = mcq[0].lower().strip()
+            if question_text not in seen_questions:
+                seen_questions.add(question_text)
+                unique_mcq_data.append(mcq)
         # Create DataFrame
+        df = pd.DataFrame(unique_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
         # Create temporary Excel file for download
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
         progress(1.0, desc="Complete!")
+        success_message = f"✅ Successfully generated {len(unique_mcq_data)} unique MCQs from {total_chunks} text chunks!"
         return success_message, temp_file.name