Spaces:
Sleeping
Sleeping
Update read_photodocument.py
Browse files- read_photodocument.py +15 -0
read_photodocument.py
CHANGED
|
@@ -19,6 +19,8 @@ from spellchecker import SpellChecker
|
|
| 19 |
from tqdm.auto import tqdm
|
| 20 |
import nltk
|
| 21 |
import contextlib
|
|
|
|
|
|
|
| 22 |
nltk.download("stopwords") # TODO=find where this requirement originates from
|
| 23 |
|
| 24 |
|
|
@@ -28,6 +30,19 @@ def simple_rename(filepath, target_ext=".txt"):
|
|
| 28 |
return f"OCR_{basename}_{target_ext}"
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def rm_local_text_files(name_contains="RESULT_"):
|
| 32 |
"""
|
| 33 |
rm_local_text_files - remove local text files
|
|
|
|
| 19 |
from tqdm.auto import tqdm
|
| 20 |
import nltk
|
| 21 |
import contextlib
|
| 22 |
+
import img2pdf
|
| 23 |
+
from PIL import Image
|
| 24 |
nltk.download("stopwords") # TODO=find where this requirement originates from
|
| 25 |
|
| 26 |
|
|
|
|
| 30 |
return f"OCR_{basename}_{target_ext}"
|
| 31 |
|
| 32 |
|
| 33 |
+
def convert_image_to_pdf(image_path, model=None):
    """
    convert_image_to_pdf - wrap an image file in a PDF and run OCR on that PDF

    The PDF is written next to the source image, using the same base name
    with a ".pdf" extension, and is then handed to convert_PDF_to_Text.

    :param image_path: path to the source image file (string path)
    :param model: OCR model forwarded to convert_PDF_to_Text as `ocr_model`
    :return: whatever convert_PDF_to_Text returns for the generated PDF
    """
    import os  # local import: the file's top-level import block is not fully visible here

    # Swap only the final extension. A plain str.replace('.jpg', '.pdf') leaves
    # non-.jpg inputs (.png, .jpeg, .JPG, ...) unchanged, so the "PDF path"
    # would equal the image path and the source image would be overwritten.
    pdf_path = os.path.splitext(image_path)[0] + ".pdf"

    # Opening with PIL validates that the file is a readable image; the
    # context manager guarantees the handle is released even if conversion fails.
    with Image.open(image_path) as image:
        pdf_bytes = img2pdf.convert(image.filename)

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(pdf_bytes)

    return convert_PDF_to_Text(PDF_file=pdf_path, ocr_model=model)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
def rm_local_text_files(name_contains="RESULT_"):
|
| 47 |
"""
|
| 48 |
rm_local_text_files - remove local text files
|