lokesh341 committed on
Commit
5eb110a
·
verified ·
1 Parent(s): cab9f6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -15
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import fitz # PyMuPDF
2
  from PIL import Image
3
- import pytesseract
4
  from fpdf import FPDF
5
  import gradio as gr
6
 
7
- # Path to Tesseract executable
8
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
9
 
10
  # Step 1: Convert PDF Pages to Images
11
  def pdf_to_images(pdf_path):
@@ -19,16 +19,17 @@ def pdf_to_images(pdf_path):
19
  images.append(image_path)
20
  return images
21
 
22
- # Step 2: Extract Text Using Tesseract OCR
23
- def extract_text(images):
24
  text_pages = []
25
  for image_path in images:
26
- text = pytesseract.image_to_string(Image.open(image_path)) # Perform OCR
27
- text_pages.append(text)
 
28
  return text_pages
29
 
30
- # Step 3: Replace Curved Text with Editable Text in a New PDF
31
- def create_editable_pdf(images, text_pages, output_pdf_path):
32
  pdf = FPDF()
33
  pdf.set_auto_page_break(auto=True, margin=15)
34
  for text in text_pages:
@@ -42,12 +43,12 @@ def process_pdf(file):
42
  input_pdf_path = file.name
43
  output_pdf_path = "Editable_Output.pdf"
44
 
45
- # Convert PDF to images and perform OCR
46
  images = pdf_to_images(input_pdf_path)
47
- text_pages = extract_text(images)
48
-
49
- # Create a new editable PDF
50
- create_editable_pdf(images, text_pages, output_pdf_path)
51
  return output_pdf_path
52
 
53
  # Gradio Interface
@@ -56,7 +57,7 @@ iface = gr.Interface(
56
  inputs=gr.File(label="Upload PDF"),
57
  outputs=gr.File(label="Download Editable PDF"),
58
  title="OCR PDF to Editable Text",
59
- description="Upload a PDF to extract and replace text while preserving shapes and layout.",
60
  )
61
 
62
  iface.launch(share=True)
 
1
  import fitz # PyMuPDF
2
  from PIL import Image
3
+ import easyocr
4
  from fpdf import FPDF
5
  import gradio as gr
6
 
7
+ # Initialize EasyOCR reader
8
+ reader = easyocr.Reader(['en']) # Specify the languages, e.g., 'en' for English
9
 
10
  # Step 1: Convert PDF Pages to Images
11
  def pdf_to_images(pdf_path):
 
19
  images.append(image_path)
20
  return images
21
 
22
+ # Step 2: Extract Text Using EasyOCR
23
+ def extract_text_easyocr(images):
24
  text_pages = []
25
  for image_path in images:
26
+ # Perform OCR on the image
27
+ text = reader.readtext(image_path, detail=0) # Extract text without bounding box details
28
+ text_pages.append("\n".join(text))
29
  return text_pages
30
 
31
+ # Step 3: Create Editable PDF
32
+ def create_editable_pdf(text_pages, output_pdf_path):
33
  pdf = FPDF()
34
  pdf.set_auto_page_break(auto=True, margin=15)
35
  for text in text_pages:
 
43
  input_pdf_path = file.name
44
  output_pdf_path = "Editable_Output.pdf"
45
 
46
+ # Convert PDF to images and extract text
47
  images = pdf_to_images(input_pdf_path)
48
+ text_pages = extract_text_easyocr(images)
49
+
50
+ # Create a new PDF with extracted text
51
+ create_editable_pdf(text_pages, output_pdf_path)
52
  return output_pdf_path
53
 
54
  # Gradio Interface
 
57
  inputs=gr.File(label="Upload PDF"),
58
  outputs=gr.File(label="Download Editable PDF"),
59
  title="OCR PDF to Editable Text",
60
+ description="Upload a PDF to extract and replace curved text with editable text.",
61
  )
62
 
63
  iface.launch(share=True)