# ocr-tesseract / app.py
# Author: congnx (congnx1809), commit 72bd8e3
import hashlib
import os
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path, PurePath
from typing import List, Union
import cv2
import gradio as gr
import img2pdf
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
@dataclass
class MetaData:
    """
    Class to store metadata for each processed pdf file.

    Both fields stay ``None`` until the corresponding artefact has been
    produced.
    """

    # Absolute path of the folder that holds the rendered page images.
    image_path: Union[str, None] = None
    # Path of the Excel workbook generated for the PDF.
    xlsx_path: Union[str, None] = None


# Processed PDFs keyed by the MD5 hex digest of their content. Because this
# is a defaultdict, subscripting an unknown key inserts an empty MetaData;
# use ``in`` or ``.get()`` for lookups that must not create entries.
cache = defaultdict(MetaData)
def get_latest_file(directory, pattern="*"):
    """
    Return the absolute path of the most recently modified entry in
    ``directory`` that matches the glob ``pattern``, or ``None`` when
    nothing matches.
    """
    candidates = [*Path(directory).glob(pattern)]
    if candidates:
        newest = max(candidates, key=os.path.getmtime)
        return newest.absolute()
    return None
def convert_images_to_pdf(image_paths: List[str], output_pdf_path: str):
    """
    Convert images to PDF using img2pdf for better quality preservation.

    Only paths ending in .png, .jpg, .jpeg, .tiff or .bmp (case-insensitive)
    are converted; other paths are ignored. When no usable image remains a
    message is printed and no output file is created.

    Args:
        image_paths (list): List of paths to image files
        output_pdf_path (str): Path where the output PDF will be saved
    """
    supported_extensions = (".png", ".jpg", ".jpeg", ".tiff", ".bmp")

    # Check if the list is empty
    if not image_paths:
        print("No images provided!")
        return

    usable_paths = [
        path for path in image_paths if path.lower().endswith(supported_extensions)
    ]
    if not usable_paths:
        print("No supported images provided!")
        return

    # Convert before opening the destination so that a failed conversion
    # does not leave an empty or truncated PDF behind.
    pdf_bytes = img2pdf.convert(usable_paths)
    with open(output_pdf_path, "wb") as f:
        f.write(pdf_bytes)
def hash_file(filepath) -> str:
    """Return the hexadecimal MD5 digest of the content of the file at ``filepath``."""
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        # Read fixed-size blocks until read() signals end of file with b"".
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
def convert_txt(pdf_paths: List[str], is_save_image: bool = False):
    """
    OCR every page of the given PDFs and write one text file per page.

    A PDF is identified by the MD5 hash of its content. PDFs whose hash is
    already in the module-level ``cache`` are skipped. For every other PDF
    the pages are rendered to ``./tmp/<stem>/<stem>_<page>.jpg`` and the
    recognised text is written to ``./text/<stem>/<stem>_<page>.txt``.

    :param pdf_paths: List of PDF paths
    :param is_save_image: Currently unused; the page images are always kept
    :return: Absolute paths of the text files written by this call. PDFs
        that were skipped contribute no paths.
    """
    text_filepaths = []
    for pdf_path in pdf_paths:
        pdf_path = Path(pdf_path)
        stem = pdf_path.stem

        # Hash the PDF content to detect files that were processed before.
        pdf_hash = hash_file(pdf_path)
        print(f"Hash for {pdf_path}: {pdf_hash}")
        if pdf_hash in cache:
            print(f"Skipping {pdf_path}, already processed.")
            continue

        # Convert PDF to images
        images = convert_from_path(pdf_path)
        image_folder_path = Path(f"./tmp/{stem}")
        text_folder_path = Path(f"./text/{stem}")
        text_folder_path.mkdir(parents=True, exist_ok=True)
        image_folder_path.mkdir(parents=True, exist_ok=True)

        for page_number, image in enumerate(images, start=1):
            image_path = image_folder_path / f"{stem}_{page_number}.jpg"
            image.save(image_path)
            if page_number > 2:
                # From the third page on, the saved image is reloaded and
                # rotated 90 degrees counter-clockwise before OCR.
                # NOTE(review): presumably these pages are scanned sideways
                # in the expected documents; confirm.
                image = cv2.imread(str(image_path))
                image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)

            text_filepath = text_folder_path / f"{stem}_{page_number}.txt"
            txt = pytesseract.image_to_string(
                image, lang="vie", config="--oem 1 --psm 6"
            )
            text_filepath.write_text(txt, encoding="utf-8")
            text_filepaths.append(str(text_filepath.absolute()))

        # Register the PDF only after every page succeeded, so that a failed
        # run is retried on the next call instead of being skipped.
        cache[pdf_hash] = MetaData(image_path=str(image_folder_path.absolute()))
        # NOTE: the Excel export (./excel/<stem>.xlsx with "text" and "page"
        # columns, as read by filter_by_keyword) is currently disabled, so
        # xlsx_path stays None.
    return text_filepaths
def filter_by_keyword(keywords: Union[str, List[str]], hash_id: str = ""):
    """
    Filter the text in the Excel file by keyword.

    The workbook must provide a ``text`` and a ``page`` column. For every
    keyword the report contains an empty line, the keyword itself and then
    one line per row whose text contains the keyword (case-insensitive
    substring match). The report is also written to
    ``./page_id/<workbook stem>.txt``.

    :param keywords: Keyword or list of keywords to filter by
    :param hash_id: Hash of a processed PDF. When the cache holds an Excel
        path for this hash that workbook is used; otherwise the most recently
        modified ``./excel/*.xlsx`` is used.
    :return: The report text
    :raises FileNotFoundError: If no Excel workbook can be located
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    page_id_folder = Path("./page_id")
    page_id_folder.mkdir(parents=True, exist_ok=True)

    excel_path = None
    if hash_id:
        # .get() keeps the defaultdict from inserting an empty entry for a
        # hash that was never processed.
        meta = cache.get(hash_id)
        if meta is not None and meta.xlsx_path:
            excel_path = Path(meta.xlsx_path)
    if excel_path is None:
        excel_path = get_latest_file(Path("./excel"), pattern="*.xlsx")
    print(f"-------Excel path --------- {excel_path}")
    if excel_path is None:
        raise FileNotFoundError(
            "No Excel file available in ./excel to filter by keyword."
        )

    df = pd.read_excel(str(excel_path.absolute()))
    # Lower-case each text once instead of once per keyword; non-string
    # cells (e.g. empty cells) can never match and are dropped here.
    searchable_rows = [
        (text.lower(), page)
        for text, page in zip(df["text"], df["page"])
        if isinstance(text, str)
    ]

    report_parts = []
    for k in keywords:
        needle = k.lower()
        report_parts.append(f"\n{k}\n")
        report_parts.extend(
            f"{page}\n" for text, page in searchable_rows if needle in text
        )
    content = "".join(report_parts)

    page_id_path = page_id_folder / f"{excel_path.stem}.txt"
    # Explicit UTF-8 so non-ASCII keywords do not depend on the locale encoding.
    page_id_path.write_text(content, encoding="utf-8")
    return content
def gradio_interface(file, keyword=None):
    """
    Gradio interface for the PDF processing and filtering.

    :param file: Uploaded PDF file, either an object exposing its path as
        ``.name`` or the path itself; ``None`` when nothing was uploaded
    :param keyword: Optional keyword forwarded to ``filter_by_keyword``
    :return: List of text file paths produced by ``convert_txt`` for this
        upload; empty when nothing was uploaded or the PDF was already
        processed
    """
    if file is None:
        return []

    # Accept both a file wrapper with a ``name`` attribute and a plain path.
    pdf_path = getattr(file, "name", file)
    hash_id = hash_file(pdf_path)

    filepaths = []
    if hash_id in cache:
        print(f"Skipping {pdf_path}, already processed.")
    else:
        filepaths = convert_txt([pdf_path])

    if keyword:
        # The returned report text is not used here; the function returns
        # the list of text files instead.
        filter_by_keyword(keyword, hash_id)
    return filepaths
if __name__ == "__main__":
    # Install the system and Python packages at start-up, in this order,
    # before the interface is built.
    setup_commands = (
        "apt-get update && apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-vie",
        "pip install -q pytesseract openpyxl",
    )
    for command in setup_commands:
        os.system(command)

    pdf_input = gr.File(label="Upload PDF")
    keyword_input = gr.Textbox(label="Keyword")
    # outputs=gr.Textbox(label="Filtered Text"),
    files_output = gr.Files(label="Filtered Text File")

    demo = gr.Interface(
        fn=gradio_interface,
        inputs=[pdf_input, keyword_input],
        outputs=files_output,
        title="PDF Keyword Filter",
        description="Upload a PDF file and enter a keyword to filter the text.",
    )
    demo.launch(debug=True)