Spaces:

lokesh341
/

datatoeditable

Sleeping

App Files Files Community

lokesh341 committed on Jan 7, 2025

Commit

06149b8

verified ·

1 Parent(s): c93881c

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -52

app.py CHANGED Viewed

@@ -1,59 +1,56 @@
 import fitz  # PyMuPDF
 from PIL import Image
-import pytesseract
 from fpdf import FPDF
 import gradio as gr
-# Step 1: Convert PDF Pages to Images
-def pdf_to_images(pdf_path):
-    pdf_document = fitz.open(pdf_path)
-    images = []
-    for page_num in range(len(pdf_document)):
-        page = pdf_document[page_num]
-        pix = page.get_pixmap()  # Render the page as an image
-        image_path = f"page_{page_num + 1}.png"
-        pix.save(image_path)
-        images.append(image_path)
-    return images
-# Step 2: Extract Text Using Tesseract OCR
-def extract_text(images):
-    text_pages = []
-    for image_path in images:
-        text = pytesseract.image_to_string(Image.open(image_path))  # Perform OCR
-        text_pages.append(text)
-    return text_pages
-# Step 3: Replace Curved Text with Editable Text in a New PDF
-def create_editable_pdf(images, text_pages, output_pdf_path):
-    pdf = FPDF()
-    pdf.set_auto_page_break(auto=True, margin=15)
-    for text in text_pages:
-        pdf.add_page()
-        pdf.set_font("Arial", size=12)
-        pdf.multi_cell(0, 10, text)
-    pdf.output(output_pdf_path)
-# Main Function
-def process_pdf(file):
-    input_pdf_path = file.name
-    output_pdf_path = "Editable_Output.pdf"
-    # Convert PDF to images and perform OCR
-    images = pdf_to_images(input_pdf_path)
-    text_pages = extract_text(images)
-    # Create a new editable PDF
-    create_editable_pdf(images, text_pages, output_pdf_path)
-    return output_pdf_path
-# Gradio Interface
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=gr.File(label="Upload PDF"),
-    outputs=gr.File(label="Download Editable PDF"),
-    title="OCR PDF to Editable Text",
-    description="Upload a PDF to extract and replace text while preserving shapes and layout.",
 )
-iface.launch()

 import fitz  # PyMuPDF
+from pytesseract import image_to_string
 from PIL import Image
 from fpdf import FPDF
 import gradio as gr
+# Function to process PDF
+def process_pdf(pdf_file):
+    """OCR every page of a PDF and write the recognised text to a new PDF.
+
+    Each page is rendered to an image with PyMuPDF and passed through
+    Tesseract; the recognised text is then written to its own page of a
+    fresh FPDF document. Only the text is carried over: the rendered page
+    images are used for OCR and are not embedded in the output.
+
+    Args:
+        pdf_file: The uploaded PDF. Accepted forms are an object with a
+            ``read()`` method returning the PDF bytes, an object exposing
+            its path as ``.name``, or a plain filesystem path.
+
+    Returns:
+        The path of the generated PDF ("processed_output.pdf", relative to
+        the current working directory).
+    """
+    # The upload may arrive as a file-like object or as a path (the previous
+    # revision of this app read ``file.name``), so accept both forms.
+    if hasattr(pdf_file, "read"):
+        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    else:
+        pdf_document = fitz.open(getattr(pdf_file, "name", pdf_file))
+    page_texts = []
+    # Use the document as a context manager so it is closed even if
+    # rendering or OCR fails part-way through.
+    with pdf_document:
+        for page in pdf_document:
+            # alpha=False keeps the sample buffer at 3 bytes per pixel,
+            # matching the "RGB" mode handed to PIL below.
+            pix = page.get_pixmap(alpha=False)
+            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+            page_texts.append(image_to_string(img))
+    # Create a new PDF with editable text, one output page per input page
+    pdf_output = FPDF()
+    pdf_output.set_auto_page_break(auto=True, margin=15)
+    for text in page_texts:
+        pdf_output.add_page()
+        pdf_output.set_font("Arial", size=12)
+        # The built-in Arial font only covers Latin-1. OCR output routinely
+        # contains curly quotes, dashes or ligatures outside that range,
+        # which would make FPDF raise while encoding, so map them to "?".
+        safe_text = text.encode("latin-1", "replace").decode("latin-1")
+        pdf_output.multi_cell(0, 10, safe_text)
+    # Save the output PDF
+    output_file = "processed_output.pdf"
+    pdf_output.output(output_file)
+    return output_file
+# Gradio interface
+def process_pdf_interface(pdf_file):
+    """Gradio callback: run ``process_pdf`` on the upload and return its result."""
+    return process_pdf(pdf_file)
+# Create Gradio App
+# Use the top-level gr.File component for both input and output: the legacy
+# gr.inputs / gr.outputs namespaces are not available in newer Gradio
+# releases, and the previous revision of this app already used gr.File.
+interface = gr.Interface(
+    fn=process_pdf_interface,
+    inputs=gr.File(label="Upload your PDF"),
+    outputs=gr.File(label="Download Processed PDF"),
+    title="PDF Text Processing App",
+    description="Upload a PDF to process its text, replace curved text with editable text, and download the updated PDF."
+)
+# Launch Gradio app only when run as a script, not when imported
+if __name__ == "__main__":
+    interface.launch()