Chia Woon Yap committed on
Commit
48f37ba
·
verified ·
1 Parent(s): 8b93755

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -0
app.py CHANGED
@@ -184,11 +184,39 @@ def extract_text_from_pptx(pptx_path):
184
  return f"Error extracting text from PowerPoint: {str(e)}"
185
 
186
  # Function to process documents safely
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def process_document(file):
188
  try:
 
 
 
189
  file_extension = os.path.splitext(file.name)[-1].lower()
 
190
  if file_extension in [".png", ".jpg", ".jpeg"]:
191
  return "Error: Images cannot be processed for text extraction."
 
192
  if file_extension == ".pdf":
193
  content = extract_text_from_pdf(file.name)
194
  elif file_extension == ".docx":
@@ -199,14 +227,18 @@ def process_document(file):
199
  encoding = detect_encoding(file.name)
200
  with open(file.name, "r", encoding=encoding, errors="replace") as f:
201
  content = f.read()
 
202
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
203
  documents = [Document(page_content=chunk) for chunk in text_splitter.split_text(content)]
204
  vectorstore.add_documents(documents)
205
  quiz = generate_quiz(content)
206
  return f"Document processed successfully (File Type: {file_extension}). Quiz generated:\n{quiz}"
 
207
  except Exception as e:
208
  return f"Error processing document: {str(e)}"
209
 
 
 
210
  # Function to handle speech-to-text conversion
211
  def transcribe_audio(audio):
212
  sr, y = audio
 
184
  return f"Error extracting text from PowerPoint: {str(e)}"
185
 
186
  # Function to process documents safely
187
+ #def process_document(file):
188
+ # try:
189
+ # file_extension = os.path.splitext(file.name)[-1].lower()
190
+ # if file_extension in [".png", ".jpg", ".jpeg"]:
191
+ # return "Error: Images cannot be processed for text extraction."
192
+ # if file_extension == ".pdf":
193
+ # content = extract_text_from_pdf(file.name)
194
+ # elif file_extension == ".docx":
195
+ # content = extract_text_from_docx(file.name)
196
+ # elif file_extension == ".pptx":
197
+ # content = extract_text_from_pptx(file.name)
198
+ # else:
199
+ # encoding = detect_encoding(file.name)
200
+ # with open(file.name, "r", encoding=encoding, errors="replace") as f:
201
+ # content = f.read()
202
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
203
+ # documents = [Document(page_content=chunk) for chunk in text_splitter.split_text(content)]
204
+ # vectorstore.add_documents(documents)
205
+ # quiz = generate_quiz(content)
206
+ # return f"Document processed successfully (File Type: {file_extension}). Quiz generated:\n{quiz}"
207
+ # except Exception as e:
208
+ # return f"Error processing document: {str(e)}"
209
+
210
  def process_document(file):
211
  try:
212
+ if not file or not hasattr(file, "name") or not isinstance(file.name, str):
213
+ return "Error: Invalid file uploaded."
214
+
215
  file_extension = os.path.splitext(file.name)[-1].lower()
216
+
217
  if file_extension in [".png", ".jpg", ".jpeg"]:
218
  return "Error: Images cannot be processed for text extraction."
219
+
220
  if file_extension == ".pdf":
221
  content = extract_text_from_pdf(file.name)
222
  elif file_extension == ".docx":
 
227
  encoding = detect_encoding(file.name)
228
  with open(file.name, "r", encoding=encoding, errors="replace") as f:
229
  content = f.read()
230
+
231
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
232
  documents = [Document(page_content=chunk) for chunk in text_splitter.split_text(content)]
233
  vectorstore.add_documents(documents)
234
  quiz = generate_quiz(content)
235
  return f"Document processed successfully (File Type: {file_extension}). Quiz generated:\n{quiz}"
236
+
237
  except Exception as e:
238
  return f"Error processing document: {str(e)}"
239
 
240
+
241
+
242
  # Function to handle speech-to-text conversion
243
  def transcribe_audio(audio):
244
  sr, y = audio