Deevyankar committed on
Commit
42a8447
·
verified ·
1 Parent(s): 6310783

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -18
app.py CHANGED
@@ -7,7 +7,8 @@ import io
7
 
8
  def extract_text_from_pdf(uploaded_file):
9
  try:
10
- file_bytes = uploaded_file.read()
 
11
  doc = fitz.open(stream=file_bytes, filetype="pdf")
12
  text = ""
13
  for page in doc:
@@ -16,31 +17,30 @@ def extract_text_from_pdf(uploaded_file):
16
  text += page_text + "\n"
17
  return text.strip()
18
  except Exception as e:
19
- return f"Error: {str(e)}"
20
 
21
  def extract_los(lo_file):
22
- name = getattr(lo_file, "name", "")
23
- ext = name.lower().split('.')[-1]
 
 
24
 
25
- if ext == "txt":
26
- return lo_file.read().decode("utf-8").splitlines()
27
- elif ext == "docx":
28
- file_bytes = io.BytesIO(lo_file.read())
29
- doc = Document(file_bytes)
30
- return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
31
- else:
32
- return []
 
 
33
 
34
  def compare_handouts(old_pdf, new_pdf, lo_file):
35
  old_text = extract_text_from_pdf(old_pdf)
36
  new_text = extract_text_from_pdf(new_pdf)
37
  los = extract_los(lo_file)
38
- print(old_text)
39
- print("\n")
40
- print(new_text)
41
- print("\n")
42
- print(los)
43
-
44
  if not old_text or not new_text:
45
  return "❗ One or both PDFs may not contain extractable text."
46
 
 
7
 
8
  def extract_text_from_pdf(uploaded_file):
9
  try:
10
+ # Fix: handle both bytes and file-like
11
+ file_bytes = uploaded_file if isinstance(uploaded_file, bytes) else uploaded_file.read()
12
  doc = fitz.open(stream=file_bytes, filetype="pdf")
13
  text = ""
14
  for page in doc:
 
17
  text += page_text + "\n"
18
  return text.strip()
19
  except Exception as e:
20
+ return f"Error extracting text: {str(e)}"
21
 
22
  def extract_los(lo_file):
23
+ try:
24
+ file_bytes = lo_file if isinstance(lo_file, bytes) else lo_file.read()
25
+ name = getattr(lo_file, "name", "")
26
+ ext = name.lower().split('.')[-1] if name else "docx"
27
 
28
+ if ext == "txt":
29
+ return file_bytes.decode("utf-8").splitlines()
30
+ elif ext == "docx":
31
+ file_stream = io.BytesIO(file_bytes)
32
+ doc = Document(file_stream)
33
+ return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
34
+ else:
35
+ return []
36
+ except Exception as e:
37
+ return [f"Error loading LOs: {str(e)}"]
38
 
39
  def compare_handouts(old_pdf, new_pdf, lo_file):
40
  old_text = extract_text_from_pdf(old_pdf)
41
  new_text = extract_text_from_pdf(new_pdf)
42
  los = extract_los(lo_file)
43
+
 
 
 
 
 
44
  if not old_text or not new_text:
45
  return "❗ One or both PDFs may not contain extractable text."
46