Spaces:

prat1003
/

project2

Sleeping

prat1003 committed on Oct 13, 2025

Commit

e7b5f58

verified ·

1 Parent(s): 68ecf5d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,6 +7,24 @@ import tempfile
 import shutil
 import os
 # 🧠 Load lightweight question generation model
 qg_pipeline = pipeline(
     "text2text-generation",
@@ -15,12 +33,12 @@ qg_pipeline = pipeline(
 )
 # 🧩 OCR function: extract text from scanned PDFs
-def extract_text_from_scanned_pdf(file_path):
-    pages = convert_from_path(file_path)
-    text = ""
-    for page in pages:
-        text += pytesseract.image_to_string(page)
-    return text.strip()
 # ⚙️ Main processing function
 def process_pdf(pdf_file):

 import shutil
 import os
+import easyocr
+import numpy as np
+reader = easyocr.Reader(['en'])
+def extract_text_from_scanned_pdf(file_path):
+    pages = convert_from_path(file_path)
+    text = ""
+    for page in pages:
+        img_array = np.array(page)
+        result = reader.readtext(img_array, detail=0)
+        text += " ".join(result) + "\n"
+    return text.strip()
 # 🧠 Load lightweight question generation model
 qg_pipeline = pipeline(
     "text2text-generation",
 )
 # 🧩 OCR function: extract text from scanned PDFs
+#def extract_text_from_scanned_pdf(file_path):
+#   pages = convert_from_path(file_path)
+#    text = ""
+#    for page in pages:
+#        text += pytesseract.image_to_string(page)
+#   return text.strip()
 # ⚙️ Main processing function
 def process_pdf(pdf_file):