prat1003 committed on
Commit
e7b5f58
·
verified ·
1 Parent(s): 68ecf5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -6
app.py CHANGED
@@ -7,6 +7,24 @@ import tempfile
7
  import shutil
8
  import os
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # 🧠 Load lightweight question generation model
11
  qg_pipeline = pipeline(
12
  "text2text-generation",
@@ -15,12 +33,12 @@ qg_pipeline = pipeline(
15
  )
16
 
17
  # 🧩 OCR function: extract text from scanned PDFs
18
- def extract_text_from_scanned_pdf(file_path):
19
- pages = convert_from_path(file_path)
20
- text = ""
21
- for page in pages:
22
- text += pytesseract.image_to_string(page)
23
- return text.strip()
24
 
25
  # ⚙️ Main processing function
26
  def process_pdf(pdf_file):
 
7
  import shutil
8
  import os
9
 
10
+
11
+ import easyocr
12
+ import numpy as np
13
+
14
+ reader = easyocr.Reader(['en'])
15
+
16
+ def extract_text_from_scanned_pdf(file_path):
17
+ pages = convert_from_path(file_path)
18
+ text = ""
19
+ for page in pages:
20
+ img_array = np.array(page)
21
+ result = reader.readtext(img_array, detail=0)
22
+ text += " ".join(result) + "\n"
23
+ return text.strip()
24
+
25
+
26
+
27
+
28
  # 🧠 Load lightweight question generation model
29
  qg_pipeline = pipeline(
30
  "text2text-generation",
 
33
  )
34
 
35
  # 🧩 OCR function: extract text from scanned PDFs
36
+ #def extract_text_from_scanned_pdf(file_path):
37
+ # pages = convert_from_path(file_path)
38
+ # text = ""
39
+ # for page in pages:
40
+ # text += pytesseract.image_to_string(page)
41
+ # return text.strip()
42
 
43
  # ⚙️ Main processing function
44
  def process_pdf(pdf_file):