167AliRaza committed on
Commit
65956db
·
verified ·
1 Parent(s): 95fb4d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -28
app.py CHANGED
@@ -15,25 +15,17 @@ def configure_gemini_api(api_key: str):
15
  genai.configure(api_key=api_key)
16
  return "✅ API Key configured successfully!"
17
 
18
- def extract_text_from_pdf(pdf_file) -> str:
19
  """Extract text from PDF using OCR"""
20
  try:
21
- # Create temporary file
22
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
23
- tmp_file.write(pdf_file)
24
- tmp_path = tmp_file.name
25
-
26
  # Convert PDF to images
27
- pages = convert_from_path(tmp_path)
28
  all_text = ""
29
 
30
  for i, page in enumerate(pages):
31
  text = pytesseract.image_to_string(page)
32
  all_text += text + "\n"
33
 
34
- # Clean up temporary file
35
- os.unlink(tmp_path)
36
-
37
  return all_text
38
  except Exception as e:
39
  return f"Error extracting text: {str(e)}"
@@ -124,25 +116,25 @@ def generate_mcqs_from_chunk(chunk: str, api_key: str) -> List[List[str]]:
124
  def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
125
  """Main function to process PDF and generate MCQs"""
126
  if not api_key:
127
- return "❌ Please provide your Gemini API key", ""
128
 
129
  if not pdf_file:
130
- return "❌ Please upload a PDF file", ""
131
 
132
  try:
133
  # Extract text from PDF
134
  progress(0.1, desc="Extracting text from PDF...")
135
- extracted_text = extract_text_from_pdf(pdf_file)
136
 
137
  if extracted_text.startswith("Error"):
138
- return extracted_text, ""
139
 
140
  # Chunk the text
141
  progress(0.2, desc="Chunking text...")
142
  chunks = chunk_text(extracted_text, chunk_size)
143
 
144
  if not chunks:
145
- return "❌ No text could be extracted from the PDF", ""
146
 
147
  # Generate MCQs from each chunk
148
  all_mcq_data = []
@@ -160,22 +152,18 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
160
  progress(0.95, desc="Creating Excel file...")
161
 
162
  if not all_mcq_data:
163
- return "❌ No MCQs could be generated from the PDF content", ""
164
 
165
  # Create DataFrame
166
  df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
167
 
168
- # Create Excel file in memory
169
- output = io.BytesIO()
170
- with pd.ExcelWriter(output, engine='openpyxl') as writer:
171
- df.to_excel(writer, index=False, sheet_name='MCQs')
172
 
173
- output.seek(0)
174
-
175
- # Save to temporary file for download
176
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
177
- temp_file.write(output.getvalue())
178
- temp_file.close()
179
 
180
  progress(1.0, desc="Complete!")
181
 
@@ -184,7 +172,7 @@ def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress
184
  return success_message, temp_file.name
185
 
186
  except Exception as e:
187
- return f"❌ Error processing PDF: {str(e)}", ""
188
 
189
  # Create Gradio interface
190
  def create_interface():
@@ -251,7 +239,7 @@ def create_interface():
251
  outputs=[status_output, download_file],
252
  show_progress=True
253
  ).then(
254
- fn=lambda x: gr.update(visible=bool(x)),
255
  inputs=[download_file],
256
  outputs=[download_file]
257
  )
 
15
  genai.configure(api_key=api_key)
16
  return "✅ API Key configured successfully!"
17
 
18
+ def extract_text_from_pdf(pdf_file_path: str) -> str:
19
  """Extract text from PDF using OCR"""
20
  try:
 
 
 
 
 
21
  # Convert PDF to images
22
+ pages = convert_from_path(pdf_file_path)
23
  all_text = ""
24
 
25
  for i, page in enumerate(pages):
26
  text = pytesseract.image_to_string(page)
27
  all_text += text + "\n"
28
 
 
 
 
29
  return all_text
30
  except Exception as e:
31
  return f"Error extracting text: {str(e)}"
 
116
  def process_pdf_to_mcqs(pdf_file, api_key: str, chunk_size: int = 1500, progress=gr.Progress()) -> Tuple[str, str]:
117
  """Main function to process PDF and generate MCQs"""
118
  if not api_key:
119
+ return "❌ Please provide your Gemini API key", None
120
 
121
  if not pdf_file:
122
+ return "❌ Please upload a PDF file", None
123
 
124
  try:
125
  # Extract text from PDF
126
  progress(0.1, desc="Extracting text from PDF...")
127
+ extracted_text = extract_text_from_pdf(pdf_file.name)
128
 
129
  if extracted_text.startswith("Error"):
130
+ return extracted_text, None
131
 
132
  # Chunk the text
133
  progress(0.2, desc="Chunking text...")
134
  chunks = chunk_text(extracted_text, chunk_size)
135
 
136
  if not chunks:
137
+ return "❌ No text could be extracted from the PDF", None
138
 
139
  # Generate MCQs from each chunk
140
  all_mcq_data = []
 
152
  progress(0.95, desc="Creating Excel file...")
153
 
154
  if not all_mcq_data:
155
+ return "❌ No MCQs could be generated from the PDF content", None
156
 
157
  # Create DataFrame
158
  df = pd.DataFrame(all_mcq_data, columns=['Question', 'OptionA', 'OptionB', 'OptionC', 'OptionD', 'CorrectAnswer'])
159
 
160
+ # Create temporary Excel file for download
161
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb')
162
+ temp_file.close() # Close to allow pandas to write to it
 
163
 
164
+ # Write Excel file
165
+ with pd.ExcelWriter(temp_file.name, engine='openpyxl') as writer:
166
+ df.to_excel(writer, index=False, sheet_name='MCQs')
 
 
 
167
 
168
  progress(1.0, desc="Complete!")
169
 
 
172
  return success_message, temp_file.name
173
 
174
  except Exception as e:
175
+ return f"❌ Error processing PDF: {str(e)}", None
176
 
177
  # Create Gradio interface
178
  def create_interface():
 
239
  outputs=[status_output, download_file],
240
  show_progress=True
241
  ).then(
242
+ fn=lambda file_path: gr.update(visible=bool(file_path)) if file_path else gr.update(visible=False),
243
  inputs=[download_file],
244
  outputs=[download_file]
245
  )