167AliRaza committed on
Commit
d20c5e7
·
verified ·
1 Parent(s): bce1ee4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -21
app.py CHANGED
@@ -4,6 +4,8 @@ from pdf2image import convert_from_path
4
  import pytesseract
5
  import google.generativeai as genai
6
  import tempfile
 
 
7
 
8
  # Function: Extract text from PDF using OCR
9
  def extract_text_from_pdf(pdf_file):
@@ -18,7 +20,7 @@ def extract_text_from_pdf(pdf_file):
18
  def chunk_text(text, chunk_size=1500):
19
  words = text.split()
20
  for i in range(0, len(words), chunk_size):
21
- yield ' '.join(words[i:i+chunk_size])
22
 
23
  # Models to try (fallbacks)
24
  models_to_try = [
@@ -28,7 +30,7 @@ models_to_try = [
28
  "gemini-2.0-flash-lite",
29
  "gemini-2.0-flash",
30
  "gemini-1.5-flash",
31
- "gemini-1.5-pro"
32
  ]
33
 
34
  # Function: Generate MCQs
@@ -39,14 +41,18 @@ def generate_mcqs(text, api_key):
39
 
40
  for i, chunk in enumerate(chunks, start=1):
41
  prompt = f"""
42
- Generate 10 MCQs from the following text.
43
- Each question must have:
44
- - Question
45
- - 4 Options (A-D)
46
- - Correct Answer
47
- Return in CSV format: Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer.
48
- Text:\n{chunk}
49
- """
 
 
 
 
50
 
51
  response = None
52
  for model_name in models_to_try:
@@ -60,17 +66,22 @@ def generate_mcqs(text, api_key):
60
 
61
  if response and response.text:
62
  output = response.text.strip()
63
- for line in output.splitlines():
64
- parts = line.split(",")
65
- if len(parts) >= 6 and parts[0]:
66
- mcq_data.append(parts)
 
 
 
67
 
68
- filtered_mcq_data = [row for row in mcq_data if len(row) == 6]
69
- if not filtered_mcq_data:
70
  return None, None
71
 
72
- df = pd.DataFrame(filtered_mcq_data, columns=["Question", "OptionA", "OptionB", "OptionC", "OptionD", "CorrectAnswer"])
73
- return df, df.head(10).to_string(index=False) # Preview as plain table
 
 
 
74
 
75
  # Gradio pipeline
76
  def process_pdf(pdf_file, api_key):
@@ -95,19 +106,21 @@ def process_pdf(pdf_file, api_key):
95
  # Gradio UI
96
  with gr.Blocks() as demo:
97
  gr.Markdown("## 📘 PDF to MCQ Generator (Gemini AI)")
98
- gr.Markdown("Upload a PDF, enter your Gemini API key, extract text with OCR, and generate MCQs saved as Excel.")
 
 
99
 
100
  api_key = gr.Textbox(label="Enter your Gemini API Key", type="password")
101
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
102
  generate_btn = gr.Button("Generate MCQs")
103
-
104
  preview_output = gr.Textbox(label="Preview (First 10 MCQs)", lines=15)
105
  excel_output = gr.File(label="Download Excel (.xlsx)")
106
 
107
  generate_btn.click(
108
  fn=process_pdf,
109
  inputs=[pdf_input, api_key],
110
- outputs=[preview_output, excel_output]
111
  )
112
 
113
  # Run app
 
4
  import pytesseract
5
  import google.generativeai as genai
6
  import tempfile
7
+ import csv
8
+ from io import StringIO
9
 
10
  # Function: Extract text from PDF using OCR
11
  def extract_text_from_pdf(pdf_file):
 
20
  def chunk_text(text, chunk_size=1500):
21
  words = text.split()
22
  for i in range(0, len(words), chunk_size):
23
+ yield " ".join(words[i:i+chunk_size])
24
 
25
  # Models to try (fallbacks)
26
  models_to_try = [
 
30
  "gemini-2.0-flash-lite",
31
  "gemini-2.0-flash",
32
  "gemini-1.5-flash",
33
+ "gemini-1.5-pro",
34
  ]
35
 
36
  # Function: Generate MCQs
 
41
 
42
  for i, chunk in enumerate(chunks, start=1):
43
  prompt = f"""
44
+ Generate 10 MCQs from the following text.
45
+ Return ONLY valid CSV rows with exactly 6 columns:
46
+ Question,OptionA,OptionB,OptionC,OptionD,CorrectAnswer
47
+
48
+ Rules:
49
+ - Do NOT add numbering, quotes, or explanations.
50
+ - Do NOT add headers.
51
+ - Do NOT add extra commas inside cells.
52
+ - Exactly 10 rows per chunk.
53
+
54
+ Text:\n{chunk}
55
+ """
56
 
57
  response = None
58
  for model_name in models_to_try:
 
66
 
67
  if response and response.text:
68
  output = response.text.strip()
69
+ try:
70
+ reader = csv.reader(StringIO(output))
71
+ for row in reader:
72
+ if len(row) >= 6 and row[0]:
73
+ mcq_data.append(row[:6]) # keep only first 6 cols
74
+ except Exception:
75
+ continue
76
 
77
+ if not mcq_data:
 
78
  return None, None
79
 
80
+ df = pd.DataFrame(
81
+ mcq_data,
82
+ columns=["Question", "OptionA", "OptionB", "OptionC", "OptionD", "CorrectAnswer"],
83
+ )
84
+ return df, df.head(10).to_string(index=False)
85
 
86
  # Gradio pipeline
87
  def process_pdf(pdf_file, api_key):
 
106
  # Gradio UI
107
  with gr.Blocks() as demo:
108
  gr.Markdown("## 📘 PDF to MCQ Generator (Gemini AI)")
109
+ gr.Markdown(
110
+ "Upload a PDF, enter your Gemini API key, extract text with OCR, and generate MCQs saved as Excel."
111
+ )
112
 
113
  api_key = gr.Textbox(label="Enter your Gemini API Key", type="password")
114
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
115
  generate_btn = gr.Button("Generate MCQs")
116
+
117
  preview_output = gr.Textbox(label="Preview (First 10 MCQs)", lines=15)
118
  excel_output = gr.File(label="Download Excel (.xlsx)")
119
 
120
  generate_btn.click(
121
  fn=process_pdf,
122
  inputs=[pdf_input, api_key],
123
+ outputs=[preview_output, excel_output],
124
  )
125
 
126
  # Run app