Spaces:

Text-to-Document-Generation
/

Docgenie-API

Paused

File size: 3,122 Bytes

dc4e6da

import os
from pathlib import Path

import numpy as np
import requests

from docgenie.generation.utils.image import img_read, img_write_to_bytes
from docgenie.utils.ocr import MicrosoftOCR, MicrosoftOCRWord

# Default engine name and cache-file suffix used by the OCR helpers below.
OCR_ENGINE = 'microsoft_di'
OCR_POSTFIX = '0.MicrosoftOcrService.json'

# The service port comes from DOCGENIE_OCR_PORT; an unset or empty value
# falls back to '8000'. The service is addressed on localhost.
OCR_PORT_ENV = os.environ.get('DOCGENIE_OCR_PORT')
OCR_PORT = OCR_PORT_ENV if OCR_PORT_ENV else '8000'
OCR_URL = f'http://localhost:{OCR_PORT}'


def get_ocr_cache_path(image_path: Path, postfix: str) -> Path:
    """Return the cache file path that sits next to ``image_path``.

    The cache file lives in the same directory as the image and is named
    ``<image file name>.<postfix>`` (the image's own extension is kept).
    """
    directory = image_path.parent
    cache_name = f'{image_path.name}.{postfix}'
    return directory.joinpath(cache_name)


def call_ocr_service_from_image(image: np.ndarray,
                                url: str = OCR_URL,
                                engine: str = OCR_ENGINE,
                                client_caching: bool = True,
                                image_path: Path | None = None) -> MicrosoftOCR:
    """Run OCR on an in-memory image through the HTTP OCR service.

    If ``client_caching`` is enabled and ``image_path`` is given, a cache file
    next to the image is consulted first and returned when it exists;
    otherwise the fresh result is written to that cache file after the
    request. Without an ``image_path`` there is nowhere to cache, so caching
    is skipped instead of failing.

    Args:
        image: Image to recognise; encoded with ``img_write_to_bytes`` before
            being uploaded.
        url: Base URL of the OCR service.
        engine: Engine name appended to the ``/v1/sync/ocr/`` endpoint.
        client_caching: Whether to read/write the cache file derived from
            ``image_path``.
        image_path: Location of the image on disk. Only used to derive the
            cache file path.

    Returns:
        A ``MicrosoftOCR`` built from the first page of the service response.

    Raises:
        requests.HTTPError: If the service responds with an error status.
        KeyError, IndexError: If the response JSON lacks the expected
            ``ocr.pages[0]`` structure or word/line fields.
    """
    # Caching needs a path to derive the cache file from; the previous code
    # dereferenced ``image_path`` even when it was None (the default).
    cache_path = None
    if client_caching and image_path is not None:
        cache_path = get_ocr_cache_path(image_path, OCR_POSTFIX)
        if cache_path.exists():
            return MicrosoftOCR.load_from_file(cache_path)

    headers = {'accept': 'application/json'}
    encoded_image = img_write_to_bytes(image)
    # NOTE(review): requests uploads every entry of ``files`` as its own
    # multipart part, so 'type' is sent as a second part rather than as the
    # image's content type - confirm the service expects this.
    files = {'image': encoded_image, 'type': 'image/png'}
    endpoint = f'{url}/v1/sync/ocr/{engine}'
    # NOTE(review): no timeout is passed, so this call can block indefinitely
    # if the service stops responding.
    response = requests.post(url=endpoint, headers=headers, files=files)
    response.raise_for_status()

    def to_ocr_words(protos):
        # Words and lines share the same text/confidence/geo layout.
        return [
            MicrosoftOCRWord(
                text=proto['text'],
                confidence=proto['confidence'],
                geo=proto['geo'],
            )
            for proto in protos
        ]

    # Only the first page of the response is used.
    first_page = response.json()['ocr']['pages'][0]
    ocr = MicrosoftOCR(
        angle=first_page['angle'],
        width=first_page['imageWidth'],
        height=first_page['imageHeight'],
        words=to_ocr_words(first_page['words']),
        lines=to_ocr_words(first_page['lines']),
    )

    if cache_path is not None:
        ocr.save_to_file(cache_path)

    return ocr


def call_ocr_service_from_file(image_path: Path,
                               url: str = OCR_URL,
                               engine: str = OCR_ENGINE,
                               client_caching: bool = True) -> MicrosoftOCR:
    """Run OCR on an image file, preferring a cached result when allowed.

    With ``client_caching`` enabled, an existing cache file next to the image
    is loaded and returned without reading the image. Otherwise the image is
    read with ``img_read`` and handed to ``call_ocr_service_from_image``.

    Args:
        image_path: Path of the image file to recognise.
        url: Base URL of the OCR service.
        engine: OCR engine name forwarded to the service call.
        client_caching: Whether the cache file next to the image may be used.

    Returns:
        The ``MicrosoftOCR`` result, either loaded from cache or freshly computed.
    """
    cached_file = get_ocr_cache_path(image_path, OCR_POSTFIX) if client_caching else None
    if cached_file is not None and cached_file.exists():
        return MicrosoftOCR.load_from_file(cached_file)

    pixels = img_read(image_path)
    return call_ocr_service_from_image(
        pixels,
        url,
        engine,
        client_caching=client_caching,
        image_path=image_path,
    )


if __name__ == '__main__':
    # Manual smoke test: OCR a single sample image and print its words.
    sample_name = "04276b91-eb12-4b47-80a6-666f6d09b6ce_1.jpg"
    sample_path = Path("data/temp/OCR/test-dataset").joinpath(sample_name)

    # With client_caching enabled, the OCR result is also written next to the image.
    result: MicrosoftOCR = call_ocr_service_from_file(sample_path, client_caching=True)
    print(result.words)