Docgenie-API / docgenie /generation /pipeline_19_create_debug_data.py
Ahadhassan-2003
deploy: update HF Space
dc4e6da
import json
import pathlib
import shutil
import fitz
from docgenie.generation.constants import IMAGE_RENDER_EXT, PDF_DPI
from docgenie.generation.models import (
OCRBox,
PipelineParameters,
SynDatasetDefinition,
SynDocumentLog,
)
from rich.progress import (
Progress,
TimeElapsedColumn,
BarColumn,
TaskProgressColumn,
TimeRemainingColumn,
)
from PIL import Image
from docgenie.generation.utils.bboxes import (
draw_bboxes_on_image,
draw_bboxes_on_pdf,
read_syn_dataset_bboxes,
save_bboxes,
)
from docgenie.generation.utils.geos import rect_to_ocrbox
from docgenie.generation.utils.log import log_pipeline_level
from docgenie.generation.utils.status import get_progress_bar
def mm_to_px(mm: int | float):
return mm * 72 / 25.4
def draw_visual_elements_debug(dsdef: SynDatasetDefinition, docid: str):
    """Create the visual-elements debug PDF for one document.

    Reads ``<docid>.json`` from the visual element definitions directory,
    converts the ``rect`` of every entry whose ``error`` field is ``None``
    with ``rect_to_ocrbox`` and hands the resulting boxes to
    ``draw_bboxes_on_pdf`` together with the document's final PDF path and
    the target path in the visual-elements debug directory.

    Args:
        dsdef: Dataset definition that provides the file structure.
        docid: Identifier of the document; used as the file stem.
    """
    files = dsdef.get_file_structure()

    definitions_file = files.visual_element_definitions_directory / f"{docid}.json"
    elements = json.loads(definitions_file.read_text(encoding="utf-8"))

    # Entries that recorded an error are left out of the drawing.
    boxes = [
        rect_to_ocrbox(element["rect"])
        for element in elements
        if element["error"] is None
    ]

    source_pdf = files.final_pdf_directory / f"{docid}.pdf"
    target_pdf = files.debug_pdf_visual_elements_directory / f"{docid}.pdf"
    draw_bboxes_on_pdf(
        source_pdf,
        target_pdf,
        boxes,
        color=(0, 0, 1),  # visual elements blue
    )
def unnormalize_bboxes(bboxes: list[OCRBox], width: float, height: float):
    """Lazily scale box coordinates by the given width and height.

    This is a generator: nothing is computed until it is iterated.

    Args:
        bboxes: Boxes whose coordinates are to be scaled.
        width: Factor applied to the ``x0`` and ``x2`` coordinates.
        height: Factor applied to the ``y0`` and ``y2`` coordinates.

    Yields:
        A new ``OCRBox`` per input box with scaled coordinates; ``text``,
        ``block_no``, ``line_no`` and ``word_no`` are copied unchanged.
    """
    for box in bboxes:
        scaled_x0 = box.x0 * width
        scaled_y0 = box.y0 * height
        scaled_x2 = box.x2 * width
        scaled_y2 = box.y2 * height
        yield OCRBox(
            x0=scaled_x0,
            y0=scaled_y0,
            x2=scaled_x2,
            y2=scaled_y2,
            text=box.text,
            block_no=box.block_no,
            line_no=box.line_no,
            word_no=box.word_no,
        )
def draw_bbox_final_debug(dsdef: SynDatasetDefinition, docid: str):
    """Save a debug image of one document with its final segment boxes drawn.

    Reads the final normalized segment-level boxes of ``docid``, scales them
    to the size of the rendered document image and saves the annotated image
    into the final-bboxes debug directory.

    Failures while drawing or saving are reported on stdout and otherwise
    ignored (best effort). Failures while reading the boxes or opening the
    image are not caught and propagate to the caller.

    Args:
        dsdef: Dataset definition that provides the file structure.
        docid: Identifier of the document; used as the file stem.
    """
    dsfiles = dsdef.get_file_structure()
    bbox_norm_path = dsfiles.get_final_normalized_bbox_path(
        level="segment", doc_id=docid
    )
    bbox_norm = read_syn_dataset_bboxes(bbox_norm_path)

    img_path = dsfiles.img_directory / f"{docid}.{IMAGE_RENDER_EXT}"
    out_path = dsfiles.debug_pdf_bboxes_final_directory / f"{docid}.{IMAGE_RENDER_EXT}"

    # Open the image in a context manager so its file handle is released
    # deterministically; this function is called once per document.
    with Image.open(img_path) as img:
        width, height = img.size
        bbox_unnorm = list(
            unnormalize_bboxes(bboxes=bbox_norm, width=width, height=height)
        )
        try:
            # Save inside the ``with`` block: the annotated image may share
            # data with ``img``, which is closed on exit.
            img_altered = draw_bboxes_on_image(img, bbox_unnorm, show_text=True)
            img_altered.save(out_path)
        except Exception as err:
            # Deliberate best effort: debug output must not abort the run.
            # NOTE: the message says "PDF" although an image is written here.
            print(f"[ERROR]: Skipping debug PDF: {str(err)}")
def draw_bbox_debug(dsdef: SynDatasetDefinition, docid: str):
    """Create the word-level bounding-box debug PDF for one document.

    Reads the word-level PDF boxes of ``docid`` and passes them to
    ``draw_bboxes_on_pdf`` with the document's initial PDF as input and a
    path in the bboxes debug directory as output. Errors raised while
    drawing are printed and otherwise ignored; errors while reading the
    boxes propagate to the caller.

    Args:
        dsdef: Dataset definition that provides the file structure.
        docid: Identifier of the document; used as the file stem.
    """
    files = dsdef.get_file_structure()
    word_boxes = read_syn_dataset_bboxes(
        files.get_pdf_bbox_path(level="word", doc_id=docid)
    )
    source_pdf = files.pdf_initial_directory / f"{docid}.pdf"
    target_pdf = files.debug_pdf_bboxes_directory / f"{docid}.pdf"

    try:
        draw_bboxes_on_pdf(pdf_path=source_pdf, outpath=target_pdf, bboxes=word_boxes)
    except Exception as err:
        # Best effort: a failed debug drawing must not stop the caller.
        print(f"[ERROR]: Skipping debug PDF: {str(err)}")
def pipeline_create_debug_data(params: PipelineParameters):
    """Create the debug artefacts for a synthetic dataset.

    For every document whose log reports exactly one PDF page this step:

    * copies the raw HTML into the raw-HTML debug directory,
    * draws the visual elements debug PDF when the document has at least
      one visual element,
    * draws the final bounding-box debug image when OCR was found.

    Afterwards ``debug.js`` (located next to this module) is copied into the
    raw-HTML debug directory.

    Args:
        params: Pipeline parameters; only ``params.dsdef`` is used.
    """
    log_pipeline_level()
    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    # Only single-page documents are valid for debug drawing.
    valid_documents: list[SynDocumentLog] = [
        doclog for doclog in dsdef.get_document_logs() if doclog.pdf_num_pages == 1
    ]
    print(f"Found {len(valid_documents)} documents valid for debug PDF/Img drawing.")

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Draw Debug PDF/Images...", total=len(valid_documents)
        )
        for doclog in valid_documents:
            docid = doclog.document_id

            # Copy raw HTML to debug directory
            src = dsfiles.raw_html_directory / f"{docid}.html"
            tgt = dsfiles.debug_html_raw_directory / f"{docid}.html"
            shutil.copy(src, tgt)

            if doclog.visual_elements_num_elements > 0:
                draw_visual_elements_debug(dsdef=dsdef, docid=docid)

            # Handwriting debug is created when handwriting is inserted

            if doclog.ocr_found:
                draw_bbox_final_debug(dsdef=dsdef, docid=docid)

            progress.update(task, advance=1)

    # Copy debug script into debug html directory
    debug_script_fname = "debug.js"
    src_path = pathlib.Path(__file__).parent / debug_script_fname
    dst_path = dsfiles.debug_html_raw_directory / debug_script_fname
    shutil.copy(src_path, dst_path)