prat1003 committed on
Commit
4b1b7b9
·
verified ·
1 Parent(s): 2085bbf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -27,7 +27,7 @@ def extract_text_from_pdf(file_path):
27
  reader_pdf = PdfReader(file_path)
28
  text = ""
29
  for page in reader_pdf.pages:
30
- t = page.extract_text()
31
  if t:
32
  text += t + "\n"
33
  return text.strip()
@@ -40,9 +40,12 @@ def extract_text_from_scanned_pdf(file_path):
40
  pages = convert_from_path(file_path, dpi=150)
41
  text = ""
42
  for page in pages:
43
- img_array = np.array(page)
44
- result = reader.readtext(img_array, detail=0)
45
- text += " ".join(result) + "\n"
 
 
 
46
  return text.strip()
47
 
48
  # -----------------------------
 
27
  reader_pdf = PdfReader(file_path)
28
  text = ""
29
  for page in reader_pdf.pages:
30
+ t = getattr(page, 'extract_text', lambda: None)()
31
  if t:
32
  text += t + "\n"
33
  return text.strip()
 
40
  pages = convert_from_path(file_path, dpi=150)
41
  text = ""
42
  for page in pages:
43
+ try:
44
+ img_array = np.array(page)
45
+ result = reader.readtext(img_array, detail=0)
46
+ text += " ".join(result) + "\n"
47
+ except Exception as e:
48
+ print("OCR error on page:", e)
49
  return text.strip()
50
 
51
  # -----------------------------