Spaces:

lokesh341
/

datatoeditable

Sleeping

App Files Files Community

lokesh341 committed on Jan 7, 2025

Commit

5eb110a

verified ·

1 Parent(s): cab9f6f

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -15

app.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import fitz  # PyMuPDF
 from PIL import Image
-import pytesseract
 from fpdf import FPDF
 import gradio as gr
-# Path to Tesseract executable
-pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 # Step 1: Convert PDF Pages to Images
 def pdf_to_images(pdf_path):
@@ -19,16 +19,17 @@ def pdf_to_images(pdf_path):
         images.append(image_path)
     return images
-# Step 2: Extract Text Using Tesseract OCR
-def extract_text(images):
     text_pages = []
     for image_path in images:
-        text = pytesseract.image_to_string(Image.open(image_path))  # Perform OCR
-        text_pages.append(text)
     return text_pages
-# Step 3: Replace Curved Text with Editable Text in a New PDF
-def create_editable_pdf(images, text_pages, output_pdf_path):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     for text in text_pages:
@@ -42,12 +43,12 @@ def process_pdf(file):
     input_pdf_path = file.name
     output_pdf_path = "Editable_Output.pdf"
-    # Convert PDF to images and perform OCR
     images = pdf_to_images(input_pdf_path)
-    text_pages = extract_text(images)
-    # Create a new editable PDF
-    create_editable_pdf(images, text_pages, output_pdf_path)
     return output_pdf_path
 # Gradio Interface
@@ -56,7 +57,7 @@ iface = gr.Interface(
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Editable PDF"),
     title="OCR PDF to Editable Text",
-    description="Upload a PDF to extract and replace text while preserving shapes and layout.",
 )
 iface.launch(share=True)

 import fitz  # PyMuPDF
 from PIL import Image
+import easyocr
 from fpdf import FPDF
 import gradio as gr
+# Initialize EasyOCR reader
+reader = easyocr.Reader(['en'])  # Specify the languages, e.g., 'en' for English
 # Step 1: Convert PDF Pages to Images
 def pdf_to_images(pdf_path):
         images.append(image_path)
     return images
+# Step 2: Extract Text Using EasyOCR
+def extract_text_easyocr(images):
     text_pages = []
     for image_path in images:
+        # Perform OCR on the image
+        text = reader.readtext(image_path, detail=0)  # Extract text without bounding box details
+        text_pages.append("\n".join(text))
     return text_pages
+# Step 3: Create Editable PDF
+def create_editable_pdf(text_pages, output_pdf_path):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     for text in text_pages:
     input_pdf_path = file.name
     output_pdf_path = "Editable_Output.pdf"
+    # Convert PDF to images and extract text
     images = pdf_to_images(input_pdf_path)
+    text_pages = extract_text_easyocr(images)
+    # Create a new PDF with extracted text
+    create_editable_pdf(text_pages, output_pdf_path)
     return output_pdf_path
 # Gradio Interface
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Editable PDF"),
     title="OCR PDF to Editable Text",
+    description="Upload a PDF to extract and replace curved text with editable text.",
 )
 iface.launch(share=True)