"""OCR pipeline built on Google Cloud Vision.

Loads GCP credentials from the environment, runs text detection on PNG
images, and post-processes the response into blocks / sentences with
per-fragment language detection.
"""
import io
import json
import os
import pathlib
from glob import glob
from math import sqrt
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
from dotenv import load_dotenv
from google.cloud import vision
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

from base import OCROut, OCRTextOut, TextWithLanguage, ocr

load_dotenv()

# The Vision client is built once at import time from the service-account
# JSON stored in the `gcp_credentials` environment variable.
gcp_credentials = os.getenv("gcp_credentials")
if gcp_credentials is None:
    # Fail early with a clear message instead of the opaque TypeError that
    # json.loads(None) would raise.
    raise RuntimeError(
        "Environment variable 'gcp_credentials' is not set; it must contain "
        "the GCP service-account credentials as a JSON string."
    )
creds = json.loads(gcp_credentials)
google_client = vision.ImageAnnotatorClient.from_service_account_info(creds)


def image_to_byte_array(image) -> bytes:
    """Serialize a PIL image to PNG-encoded bytes.

    Args:
        image: a PIL.Image.Image (anything with a ``save(fp, format=...)``).

    Returns:
        The PNG-encoded image content.
    """
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="png")
    return img_byte_arr.getvalue()


def resize_to_megapixels(image, max_megapixels=20):
    """Downscale ``image`` to fit a megapixel budget, preserving aspect ratio.

    If the image already fits within ``max_megapixels`` it is returned
    unchanged; otherwise a resized copy is returned.
    """
    original_x, original_y = image.size
    pixel_budget = max_megapixels * 1000000
    if original_x * original_y <= pixel_budget:
        return image
    # Solving new_x * new_y == budget with new_x / new_y == x / y gives:
    #   new_x = sqrt(x * budget / y),  new_y = sqrt(y * budget / x)
    # (the original's `.real` on a float was a no-op and is dropped).
    new_x = int(sqrt((original_x * pixel_budget) / original_y))
    new_y = int(sqrt((original_y * pixel_budget) / original_x))
    return image.resize((new_x, new_y))


def load_product_json_data(path) -> Dict[str, Any]:
    """Load every ``*.json`` file under ``path`` into a dict.

    Keys are the file stems (product names), values the parsed JSON content.
    """
    output_dictionary = {}
    for json_path in glob(path + "*.json"):
        product_name = pathlib.Path(json_path).stem
        # `with` guarantees the handle is closed (the original leaked it).
        with open(json_path) as f:
            output_dictionary[product_name] = json.load(f)
    return output_dictionary


def load_product_dataframe_data(path) -> Dict[str, pd.DataFrame]:
    """Load every ``*.json`` file under ``path`` into a dict of DataFrames.

    Keys are the file stems (product names); each file is read with
    ``orient='index'``.
    """
    output_dictionary = {}
    for json_path in glob(path + "*.json"):
        product_name = pathlib.Path(json_path).stem
        output_dictionary[product_name] = pd.read_json(json_path, orient="index")
    return output_dictionary


def run_image_ocr(image_bytes: bytes, google_client) -> Tuple[ocr.Output, dict]:
    """Run Google Cloud Vision text detection on a PNG image.

    Args:
        image_bytes: PNG-encoded image content.
        google_client: a ``vision.ImageAnnotatorClient``.

    Returns:
        A tuple of the parsed ``ocr.Output`` and the raw response dict.
        (The original return annotation omitted the second element.)
    """
    # Send image to API
    google_response = google_client.text_detection(
        image=vision.Image(content=image_bytes)
    )
    # Round-trip through JSON so the protobuf response becomes a plain dict.
    google_response_json = json.loads(
        vision.AnnotateImageResponse.to_json(google_response)
    )
    return ocr.Output(**google_response_json), google_response_json


def get_block_language(block: ocr.Block) -> str:
    """Return the most confident language code in ``block``, or "unk"."""
    if not block.property:
        return "unk"
    try:
        best = max(block.property.detectedLanguages, key=lambda lang: lang.confidence)
        return best.languageCode
    except (TypeError, ValueError, AttributeError):
        # detectedLanguages missing/None (TypeError), empty (ValueError from
        # max) or malformed entries (AttributeError) — narrowed from the
        # original bare `except Exception`.
        return "unk"


def block2text(block: ocr.Block) -> TextWithLanguage:
    """Extract the concatenated text of a block with its dominant language.

    A space is inserted after any symbol that carries a detectedBreak
    property, approximating word boundaries.
    """
    lang_code = get_block_language(block)
    pieces: List[str] = []
    for paragraph in block.paragraphs:
        for word in paragraph.words:
            for symbol in word.symbols:
                pieces.append(symbol.text)
                if symbol.property and symbol.property.detectedBreak:
                    pieces.append(" ")
    return TextWithLanguage(text="".join(pieces).strip(), lang_code=lang_code)


def get_sentences(blocks: List[TextWithLanguage]) -> List[TextWithLanguage]:
    """Split block texts into sentences and detect each sentence's language.

    Uses a Punkt tokenizer primed with common (mostly Dutch) abbreviations so
    they are not mistaken for sentence boundaries.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        "dr",
        "vs",
        "mr",
        "mrs",
        "prof",
        "inc",
        "vit",
        "o.a.",
        "o.b.v.",
        "s.p.",
        "m.u.v.",
        "i.v.m.",
        "a.k.a.",
        "e.g.",
        "m.b.v.",
        "max.",
        "min.",
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = []
    for block in blocks:
        for sentence in sentence_splitter.tokenize(block.text):
            try:
                lang_code = detect(str(sentence).lower())
                # Because of lack of context in a sentence, Afrikaans is
                # often recognized where the text is actually Dutch.
                if lang_code == "af":
                    lang_code = "nl"
            except LangDetectException:
                lang_code = "unk"
            sentences.append(
                TextWithLanguage(text=str(sentence), lang_code=lang_code)
            )
    return sentences


def run_ocr(image_bytes: bytes) -> Tuple[OCRTextOut, Union[dict, Any]]:
    """OCR an image and return structured text plus the raw API response.

    API response is of the type AnnotateImageResponse, see
    https://cloud.google.com/vision/docs/reference/rest/v1/AnnotateImageResponse
    for more details.

    Returns:
        ``(OCRTextOut, raw_response_dict)`` — the original annotation
        (``Union[OCROut, Any]``) did not match the actual return value.
    """
    ocr_image_annotation, response_json = run_image_ocr(image_bytes, google_client)
    # We assume we will only process pictures of one page, and no documents
    # of more than one page. Hence, we take the first page here.
    if ocr_image_annotation.fullTextAnnotation:
        ocr_blocks = ocr_image_annotation.fullTextAnnotation.pages[0].blocks
        text = ocr_image_annotation.fullTextAnnotation.text
        blocks = [block2text(block) for block in ocr_blocks]
        block_texts = [block.text for block in blocks]
        sentences = get_sentences(blocks)
    else:
        # No text detected: emit empty placeholders so the output shape is
        # stable for callers.
        block_texts = [""]
        text = ""
        sentences = [TextWithLanguage(text="", lang_code="")]
    ocr_text_out = OCRTextOut(blocks=block_texts, full_text=text, sentences=sentences)
    return ocr_text_out, response_json