# retailpxdemo/utils.py
import io
import json
from google.cloud import vision
from base import OCROut, OCRTextOut, TextWithLanguage, ocr
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer
from langdetect import detect
import os
from langdetect.lang_detect_exception import LangDetectException
from typing import Any, Union, List
from glob import glob
from math import sqrt
import pathlib
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
gcp_credentials = os.getenv('gcp_credentials')
creds = json.loads(gcp_credentials)
google_client = vision.ImageAnnotatorClient.from_service_account_info(creds)
def image_to_byte_array(image) -> bytes:
    """Serialize a PIL image to PNG-encoded bytes."""
    with io.BytesIO() as buffer:
        image.save(buffer, format="png")
        return buffer.getvalue()
def resize_to_megapixels(image, max_megapixels=20):
    """
    Downscale *image* so its pixel count fits within *max_megapixels*,
    preserving the aspect ratio.

    Images already within budget are returned unchanged.
    """
    width, height = image.size
    pixel_budget = max_megapixels * 1000000
    if width * height <= pixel_budget:
        return image
    # Solve new_w * new_h ~= budget with new_w/new_h == width/height.
    scaled_width = int((sqrt((width * pixel_budget) / height)).real)
    scaled_height = int((sqrt((height * pixel_budget) / width)).real)
    return image.resize((scaled_width, scaled_height))
def load_product_json_data(path):
    """
    Load every ``*.json`` file directly under *path* into a dictionary.

    Args:
        path: Directory prefix that the ``*.json`` glob pattern is appended
            to (for a plain directory it must end with a path separator).

    Returns:
        dict mapping file stem (product name) -> parsed JSON content.
    """
    output_dictionary = {}
    for json_path in glob(path + "*.json"):
        product_name = pathlib.Path(json_path).stem
        # Context manager so the handle is always closed — the previous
        # version called open() without ever closing the file (leak).
        with open(json_path) as f:
            output_dictionary[product_name] = json.load(f)
    return output_dictionary
def load_product_dataframe_data(path):
    """
    Load every ``*.json`` file directly under *path* as a pandas
    DataFrame (``orient='index'``), keyed by file stem (product name).
    """
    dataframes = {}
    for json_path in glob(path + "*.json"):
        name = pathlib.Path(json_path).stem
        dataframes[name] = pd.read_json(json_path, orient='index')
    return dataframes
def run_image_ocr(image_bytes: bytes, google_client) -> tuple:
    """Run Google Cloud Vision text detection on a PNG image.

    Args:
        image_bytes: Raw PNG image bytes.
        google_client: An initialized ``vision.ImageAnnotatorClient``.

    Returns:
        A 2-tuple of (parsed ``ocr.Output`` model, raw response dict).
    """
    # Send image to API
    google_response = google_client.text_detection(
        image=vision.Image(content=image_bytes)
    )
    # Round-trip through JSON so the protobuf response becomes a plain dict
    # that the ocr.Output pydantic-style model can be built from.
    google_response_json = json.loads(
        vision.AnnotateImageResponse.to_json(google_response)
    )
    return ocr.Output(**google_response_json), google_response_json
def get_block_language(block: ocr.Block) -> str:
    """Return the highest-confidence language code for *block*.

    Returns:
        The ``languageCode`` of the most confident detected language, or
        ``"unk"`` when the block has no property metadata or no detected
        languages.
    """
    if not block.property:
        return "unk"
    # detectedLanguages may be absent or empty; handle both explicitly
    # instead of swallowing every Exception as the previous version did.
    detected = getattr(block.property, "detectedLanguages", None)
    if not detected:
        return "unk"
    best = max(detected, key=lambda lang: lang.confidence)
    return best.languageCode
def block2text(block: ocr.Block) -> TextWithLanguage:
    """Flatten an OCR block into plain text tagged with its dominant language."""
    lang_code = get_block_language(block)
    pieces = []
    for paragraph in block.paragraphs:
        for word in paragraph.words:
            for symbol in word.symbols:
                pieces.append(symbol.text)
                # A detected break after a symbol is rendered as one space.
                if symbol.property and symbol.property.detectedBreak:
                    pieces.append(" ")
    return TextWithLanguage(text="".join(pieces).strip(), lang_code=lang_code)
# Takes a list of blocks, and parses the sentences in each block
def get_sentences(blocks: List[TextWithLanguage]) -> List[TextWithLanguage]:
    """
    Split the text of each block into sentences and detect each
    sentence's language.

    Args:
        blocks: Block-level texts (with block-level language codes).

    Returns:
        One TextWithLanguage per detected sentence; ``lang_code`` is
        ``"unk"`` when langdetect cannot classify the sentence.
    """
    # Abbreviations that should not be treated as sentence terminators.
    # NOTE(review): Punkt conventionally stores abbreviations lowercased
    # and WITHOUT the final period (e.g. "o.a" rather than "o.a."); the
    # dotted entries below may therefore never match — verify against
    # the nltk Punkt documentation.
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        "dr",
        "vs",
        "mr",
        "mrs",
        "prof",
        "inc",
        "vit",
        "o.a.",
        "o.b.v.",
        "s.p.",
        "m.u.v.",
        "i.v.m.",
        "a.k.a.",
        "e.g.",
        "m.b.v.",
        "max.",
        "min.",
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = []
    for block in blocks:
        for sentence in sentence_splitter.tokenize(block.text):
            try:
                # Lowercasing before detection; short sentences give
                # langdetect little context to work with.
                lang_code = detect(str(sentence).lower())
                # Because of lack of context in a sentence
                # afrikaans is often recognized
                if lang_code == "af":
                    lang_code = "nl"
            except LangDetectException:
                lang_code = "unk"
            sentences.append(TextWithLanguage(text=str(sentence), lang_code=lang_code))
    return sentences
def run_ocr(image_bytes: bytes) -> Union[OCROut, Any]:
    """Run the full OCR pipeline on an image.

    Returns a 2-tuple of (OCRTextOut with block texts / full text /
    per-sentence languages, raw Vision API response dict).
    """
    # API response is of the type AnnotateImageResponse, see
    # https://cloud.google.com/vision/docs/reference/rest/v1/AnnotateImageResponse
    # for more details.
    ocr_image_annotation, response_json = run_image_ocr(image_bytes, google_client)
    annotation = ocr_image_annotation.fullTextAnnotation
    if not annotation:
        # No text detected: return empty sentinels so callers always get
        # a uniform OCRTextOut shape.
        empty = OCRTextOut(
            blocks=[""],
            full_text="",
            sentences=[TextWithLanguage(text="", lang_code="")],
        )
        return empty, response_json
    # We assume we will only process pictures of one page, and no
    # documents of more than one page — hence pages[0].
    blocks = [block2text(block) for block in annotation.pages[0].blocks]
    ocr_text_out = OCRTextOut(
        blocks=[block.text for block in blocks],
        full_text=annotation.text,
        sentences=get_sentences(blocks),
    )
    return ocr_text_out, response_json