Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
| 1 |
-
import fitz # PyMuPDF
|
| 2 |
from PIL import Image
|
| 3 |
import pytesseract
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def extract_images_from_pdf(pdf_path):
|
| 6 |
"""
|
| 7 |
-
Extract images from
|
| 8 |
"""
|
| 9 |
images = []
|
| 10 |
document = fitz.open(pdf_path)
|
|
@@ -17,7 +21,7 @@ def extract_images_from_pdf(pdf_path):
|
|
| 17 |
|
| 18 |
def perform_ocr_on_images(images):
|
| 19 |
"""
|
| 20 |
-
Perform OCR on the extracted images.
|
| 21 |
"""
|
| 22 |
ocr_results = []
|
| 23 |
for img in images:
|
|
@@ -25,15 +29,23 @@ def perform_ocr_on_images(images):
|
|
| 25 |
ocr_results.append(text)
|
| 26 |
return "\n".join(ocr_results)
|
| 27 |
|
| 28 |
-
def ocr_marathi_from_pdf(
|
| 29 |
"""
|
| 30 |
-
Main function to handle Marathi OCR from a PDF.
|
| 31 |
"""
|
| 32 |
-
images = extract_images_from_pdf(
|
| 33 |
ocr_text = perform_ocr_on_images(images)
|
| 34 |
return ocr_text
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if __name__ == "__main__":
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
print(ocr_text)
|
|
|
|
| 1 |
+
import fitz # PyMuPDF for PDF processing
|
| 2 |
from PIL import Image
|
| 3 |
import pytesseract
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
# Ensure Tesseract is configured with Marathi language support
|
| 7 |
+
# Install Marathi language: sudo apt-get install tesseract-ocr-mar
|
| 8 |
|
| 9 |
def extract_images_from_pdf(pdf_path):
|
| 10 |
"""
|
| 11 |
+
Extract images from a PDF file using PyMuPDF.
|
| 12 |
"""
|
| 13 |
images = []
|
| 14 |
document = fitz.open(pdf_path)
|
|
|
|
| 21 |
|
| 22 |
def perform_ocr_on_images(images):
|
| 23 |
"""
|
| 24 |
+
Perform OCR on the extracted images using pytesseract for Marathi text.
|
| 25 |
"""
|
| 26 |
ocr_results = []
|
| 27 |
for img in images:
|
|
|
|
| 29 |
ocr_results.append(text)
|
| 30 |
return "\n".join(ocr_results)
|
| 31 |
|
| 32 |
+
def ocr_marathi_from_pdf(pdf_file):
    """
    Main function to handle Marathi OCR from an uploaded PDF file.

    Args:
        pdf_file: The upload handed to the UI callback. May be ``None``
            (nothing was uploaded), a filesystem path (``str`` or
            ``os.PathLike``), or an object that exposes the on-disk path
            through a ``.name`` attribute.

    Returns:
        The text produced by ``perform_ocr_on_images`` for the images that
        ``extract_images_from_pdf`` pulls out of the PDF, or an empty string
        when no file was supplied.
    """
    import os  # local import so the block does not depend on file-level imports

    # Submitting the form without a file passes None; return empty text
    # instead of failing with AttributeError on `.name`.
    if pdf_file is None:
        return ""

    if isinstance(pdf_file, str):
        # Already a path; str() flattens str subclasses to a plain str.
        pdf_path = str(pdf_file)
    elif isinstance(pdf_file, os.PathLike):
        # Checked before `.name`: on a pathlib.Path, `.name` is only the
        # final path component, not the full location.
        pdf_path = os.fspath(pdf_file)
    else:
        # File-like upload wrapper: the full path is carried in `.name`.
        pdf_path = pdf_file.name

    images = extract_images_from_pdf(pdf_path)
    ocr_text = perform_ocr_on_images(images)
    return ocr_text
|
| 39 |
|
| 40 |
+
# Define the Gradio interface
|
| 41 |
+
interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    # `type` is intentionally left at the component default. The previous
    # value "file" is rejected by Gradio 4+ (only "filepath"/"binary" are
    # accepted there), while on Gradio 3 "file" is already the default, so
    # omitting it keeps the component constructible on both. The value
    # passed to `fn` is expected to expose the upload path via `.name` in
    # either case -- NOTE(review): confirm against the installed Gradio.
    inputs=gr.File(label="Upload Marathi PDF"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|
| 51 |
+
|
|
|