Spaces:
Build error
Build error
Added first scan logic
Browse files- app.py +35 -4
- base/__init__.py +30 -0
- base/__init__.py.bak +44 -0
- base/data_models.py +236 -0
- base/ocr.py +86 -0
- data_models.py +236 -0
- train_classifiers.ipynb +0 -0
- utils.py +129 -0
- woc-logo-black.1a4c4e90.svg +38 -0
app.py
CHANGED
|
@@ -1,7 +1,38 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
def
|
| 4 |
-
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import json
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from utils import run_ocr, image_to_byte_array
|
| 5 |
|
| 6 |
+
def extract_all(image):
    """Run OCR on an uploaded image and return (attributes_json, full_text).

    Parameters
    ----------
    image : numpy.ndarray
        Image array as delivered by the Gradio "image" input component.

    Returns
    -------
    tuple[str, str]
        A JSON-formatted string of extracted product attributes, and the
        flattened OCR text (newlines replaced by spaces).
    """
    image = Image.fromarray(image)

    image_byte_array = image_to_byte_array(image)
    ocr_text_out, response_json = run_ocr(image_byte_array)
    # Flatten the OCR text onto one line for textbox display.
    ocr_full_text = str(ocr_text_out.full_text).replace('\n', ' ')

    # Extract attributes
    # TODO(review): real field extraction is stubbed out; the dictionary below
    # is a hard-coded placeholder demo payload.
    #output_dictionary, product_description = exctract_fields(ocr_full_text)
    output_dictionary = {
        "GPC": 10000045,
        "BRAND": "Kitkat",
        "FUNCTIONAL NAME": "Chocolate Bar",
        "Weight": "41.5",
        "Unit": "Gr",
        "Contact Information": {"Website": "www.kitkat.com", "Adress": " Nestlé Deutschland AG, 60523 \n Frankfurt am Main, Germany"},
        "Allergen_Statement_NL": "Bevat: MELK, TARWE",
    }

    # json.dumps already returns a str; the former str() wrappers (here and in
    # the return statement) were redundant.
    output_dict_json = json.dumps(output_dictionary, indent=2)
    return output_dict_json, ocr_full_text
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# --- Gradio UI wiring (module-level script) ---

# NOTE(review): `output_df` is never used below — presumably kept for a future
# tabular output view; confirm or remove.
output_df = [gr.Dataframe(label="Data")]

attributes_tbox = gr.Textbox(label='Attributes')
#product_description_tbox = gr.Textbox(label='Product Description')
ocr_output_tbox = gr.Textbox(label='OCR Output')

# NOTE(review): the original code called `gr.Button.style("{color: blue}")` on
# the Button *class* with a CSS string; `style()` expects keyword arguments on
# an instance, so that call had no styling effect and was removed. Button
# colors are handled by the `css` argument below.
gr.Interface(fn=extract_all, inputs="image", outputs=[attributes_tbox, ocr_output_tbox],
             title= 'World of Content', description="GS1 Global Extractor",
             css="body {background-color: #F5F7FA} .gr-button.gr-button-primary {background-color: #0080fa; color:white; --tw-gradient-from:0} .gr-button.gr-button-secondary {background-color: #172533; color:white; --tw-gradient-from:0} h1 {background-image: url('file=woc-logo-black.1a4c4e90.svg'); background-size:contain; background-repeat:no-repeat; background-position:center; text-indent:-999999999px} .output-markdown p{text-align:center; font-size:24px}").launch()
|
base/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public re-exports for the `base` package: the shared pydantic data models
# plus the Google Vision OCR response types, so callers can simply write
# `from base import Attribute, OCROut, ...`.
#
# NOTE(review): this previously read `from data_models import ...`, which only
# resolved because a duplicate copy of base/data_models.py sits at the
# repository root. Importing from `base.data_models` keeps the package
# self-contained and consistent with the `base.ocr` import below.
from base.data_models import (  # noqa: F401
    Allergen,
    AllergensOut,
    Attribute,
    AttributeAllergen,
    AttributeCommunicationChannel,
    ClassifiedText,
    CommunicationChannels,
    CommunicationChannelsOut,
    ModelOut,
    ModelOutList,
    NetContent,
    NetContentAttribute,
    NutrientTable,
    NutrientTableDailyValueIntake,
    NutrientTableElement,
    NutrientTableQuantity,
    OCROut,
    OCROutList,
    OCRTableOut,
    OCRTextOut,
    OCRWrapperOut,
    PipelineInput,
    PipelineOutput,
    RedirectInput,
    TextWithLanguage,
    TrainModelOut,
)

from base.ocr import *  # noqa: F401,F403
|
base/__init__.py.bak
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# isort: skip_file
|
| 2 |
+
from kedro.extras.datasets.json import JSONDataSet # noqa 401
|
| 3 |
+
from kedro.extras.datasets.pandas import CSVDataSet # noqa 401
|
| 4 |
+
from kedro.extras.datasets.pandas import ParquetDataSet # noqa 401
|
| 5 |
+
from kedro.extras.datasets.pandas import ExcelDataSet as excel_dataset # noqa 401
|
| 6 |
+
from kedro.extras.datasets.pandas import JSONDataSet as pandas_json # noqa 401
|
| 7 |
+
from kedro.extras.datasets.pandas.parquet_dataset import ParquetDataSet # noqa 401
|
| 8 |
+
from kedro.extras.datasets.pillow import ImageDataSet # noqa 401
|
| 9 |
+
from kedro.extras.datasets.text import TextDataSet # noqa 401
|
| 10 |
+
from kedro.io.core import AbstractDataSet as DataSet # noqa 401
|
| 11 |
+
|
| 12 |
+
from certifai.base.data_models import ( # noqa 401, 403
|
| 13 |
+
Allergen,
|
| 14 |
+
AllergensOut,
|
| 15 |
+
Attribute,
|
| 16 |
+
AttributeAllergen,
|
| 17 |
+
AttributeCommunicationChannel,
|
| 18 |
+
ClassifiedText,
|
| 19 |
+
CommunicationChannels,
|
| 20 |
+
CommunicationChannelsOut,
|
| 21 |
+
ModelOut,
|
| 22 |
+
ModelOutList,
|
| 23 |
+
NetContent,
|
| 24 |
+
NetContentAttribute,
|
| 25 |
+
NutrientTable,
|
| 26 |
+
NutrientTableDailyValueIntake,
|
| 27 |
+
NutrientTableElement,
|
| 28 |
+
NutrientTableQuantity,
|
| 29 |
+
OCROut,
|
| 30 |
+
OCROutList,
|
| 31 |
+
OCRTableOut,
|
| 32 |
+
OCRTextOut,
|
| 33 |
+
OCRWrapperOut,
|
| 34 |
+
PipelineInput,
|
| 35 |
+
PipelineOutput,
|
| 36 |
+
RedirectInput,
|
| 37 |
+
TextWithLanguage,
|
| 38 |
+
TrainModelOut,
|
| 39 |
+
)
|
| 40 |
+
from certifai.base.abstract import BaseClassifier # noqa 401, 403
|
| 41 |
+
from certifai.base.s3_helper_functions import * # noqa 401, 403
|
| 42 |
+
from certifai.base.custom_datasets import * # noqa 401, 403
|
| 43 |
+
from certifai.base.ocr import * # noqa 401, 403
|
| 44 |
+
from kedro.extras.datasets.yaml import YAMLDataSet # noqa 401
|
base/data_models.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic models used throughout the codebase.
|
| 3 |
+
|
| 4 |
+
In particular, these are the types that are used as input and output of each step of the pipeline.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
from typing import Any, Optional, Union
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RedirectInput(BaseModel):
|
| 13 |
+
pipeline_arn: str
|
| 14 |
+
job_id: str
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class NutrientTableQuantity(BaseModel):
|
| 18 |
+
measurementUnitCode: str
|
| 19 |
+
value: str
|
| 20 |
+
precisionCode: str
|
| 21 |
+
|
| 22 |
+
def __str__(self):
|
| 23 |
+
return f"{self.precisionCode} {self.value} {self.measurementUnitCode}"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class NutrientTableDailyValueIntake(BaseModel):
|
| 27 |
+
value: str
|
| 28 |
+
precisionCode: str
|
| 29 |
+
|
| 30 |
+
def __str__(self):
|
| 31 |
+
return f"{self.precisionCode} {self.value}%"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def s(
    text: Optional[Union[NutrientTableQuantity, NutrientTableDailyValueIntake, str]]
) -> str:
    """
    Returns None as "?", otherwise argument as string.

    Note: any falsy argument (None, empty string) maps to "?".
    """
    return str(text) if text else "?"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class NutrientTableElement(BaseModel):
|
| 47 |
+
coordinates: str
|
| 48 |
+
probability: float
|
| 49 |
+
nutrientTypeCode: Optional[str]
|
| 50 |
+
quantityContained: NutrientTableQuantity
|
| 51 |
+
dailyValueIntakePercent: Optional[NutrientTableDailyValueIntake]
|
| 52 |
+
precisionCode: str
|
| 53 |
+
|
| 54 |
+
def __str__(self):
|
| 55 |
+
return " ".join(
|
| 56 |
+
[
|
| 57 |
+
s(self.nutrientTypeCode),
|
| 58 |
+
s(self.quantityContained),
|
| 59 |
+
f"({s(self.dailyValueIntakePercent)})",
|
| 60 |
+
]
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class NutrientTable(BaseModel):
|
| 65 |
+
nutrientBasisQuantityValue: Optional[str]
|
| 66 |
+
nutrientBasisQuantityMeasurementUnitCode: Optional[str]
|
| 67 |
+
preperationStateCode: Optional[str]
|
| 68 |
+
values: list[NutrientTableElement]
|
| 69 |
+
|
| 70 |
+
def __str__(self):
|
| 71 |
+
top = "Nutrients per " + " ".join(
|
| 72 |
+
[
|
| 73 |
+
s(self.nutrientBasisQuantityValue),
|
| 74 |
+
s(self.nutrientBasisQuantityMeasurementUnitCode),
|
| 75 |
+
f"({s(self.preperationStateCode)})",
|
| 76 |
+
]
|
| 77 |
+
)
|
| 78 |
+
vals = "\n\t".join([str(v) for v in self.values])
|
| 79 |
+
return f"{top}\n\t{vals}"
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Attribute(BaseModel):
|
| 83 |
+
coordinates: str
|
| 84 |
+
entity: str
|
| 85 |
+
probability: float
|
| 86 |
+
value: Union[str, list[NutrientTable]]
|
| 87 |
+
model: str
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class AttributeCommunicationChannel(BaseModel):
|
| 91 |
+
coordinates: str
|
| 92 |
+
probability: float
|
| 93 |
+
model: str
|
| 94 |
+
entity: str
|
| 95 |
+
communicationChannelCode: str
|
| 96 |
+
communicationValue: str
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class AttributeAllergen(BaseModel):
|
| 100 |
+
coordinates: str
|
| 101 |
+
probability: float
|
| 102 |
+
model: str
|
| 103 |
+
entity: str
|
| 104 |
+
allergenTypeCode: str
|
| 105 |
+
levelOfContainmentCode: str
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class NetContentAttribute(BaseModel):
|
| 109 |
+
coordinates: str
|
| 110 |
+
probability: float
|
| 111 |
+
model: str
|
| 112 |
+
entity: str
|
| 113 |
+
measurementUnitCode: str
|
| 114 |
+
value: str
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class AllergensOut(BaseModel):
|
| 118 |
+
entity: str
|
| 119 |
+
values: list[AttributeAllergen]
|
| 120 |
+
model: str
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class CommunicationChannelsOut(BaseModel):
|
| 124 |
+
entity: str
|
| 125 |
+
values: list[AttributeCommunicationChannel]
|
| 126 |
+
model: str
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class PipelineInput(BaseModel):
|
| 130 |
+
image_key: str
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class PipelineOutput(BaseModel):
|
| 134 |
+
attributes: list[
|
| 135 |
+
Union[Attribute, CommunicationChannelsOut, AllergensOut, NetContentAttribute]
|
| 136 |
+
]
|
| 137 |
+
job_id: str = Field(alias="job-id")
|
| 138 |
+
text: str
|
| 139 |
+
|
| 140 |
+
class Config:
|
| 141 |
+
allow_population_by_field_name = True
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class TextWithLanguage(BaseModel):
|
| 145 |
+
text: str
|
| 146 |
+
lang_code: str
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class OCRTextOut(BaseModel):
|
| 150 |
+
blocks: list[str]
|
| 151 |
+
full_text: str
|
| 152 |
+
sentences: list[TextWithLanguage]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class OCRTableOut(BaseModel):
|
| 156 |
+
tables: list[list[list[str]]]
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class OCROut(BaseModel):
|
| 160 |
+
result: Union[OCRTextOut, OCRTableOut]
|
| 161 |
+
job_id: str
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class OCROutList(BaseModel):
|
| 165 |
+
__root__: list[OCROut]
|
| 166 |
+
|
| 167 |
+
def __iter__(self):
|
| 168 |
+
return iter(self.__root__)
|
| 169 |
+
|
| 170 |
+
def __getitem__(self, item):
|
| 171 |
+
return self.__root__[item]
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class OCRWrapperOut(BaseModel):
|
| 175 |
+
blocks: list[str]
|
| 176 |
+
full_text: str
|
| 177 |
+
job_id: str
|
| 178 |
+
sentences: list[TextWithLanguage]
|
| 179 |
+
tables: list[list[list[str]]]
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class ClassifiedText(BaseModel):
|
| 183 |
+
text: str
|
| 184 |
+
attribute: str
|
| 185 |
+
confidence: float
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class CommunicationChannels(BaseModel):
|
| 189 |
+
confidence: float
|
| 190 |
+
attribute: str
|
| 191 |
+
communicationChannelCode: str
|
| 192 |
+
communicationValue: str
|
| 193 |
+
text: Optional[str] = ""
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class Allergen(BaseModel):
|
| 197 |
+
confidence: float
|
| 198 |
+
attribute: str
|
| 199 |
+
allergenTypeCode: str
|
| 200 |
+
levelOfContainmentCode: str
|
| 201 |
+
text: Optional[str] = ""
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
class NetContent(BaseModel):
|
| 205 |
+
confidence: float
|
| 206 |
+
attribute: str
|
| 207 |
+
measurementUnitCode: str
|
| 208 |
+
value: str
|
| 209 |
+
text: Optional[str] = ""
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class ModelOut(BaseModel):
    """A single classifier model's full output for one OCR job."""

    # Classified text fragments; heterogeneous because each attribute family
    # has its own output shape.
    blocks: list[Union[NetContent, Allergen, CommunicationChannels, ClassifiedText]]
    tables: Optional[list[NutrientTable]]  # parsed nutrient tables, if any
    job_id: str
    model: str  # name of the model that produced this output
    full_text: str

    def toJSON(self):
        # NOTE(review): serializes via __dict__ with a generic `default` hook
        # instead of pydantic's own .json(); presumably kept for the stable
        # sort_keys ordering — confirm output compatibility before replacing
        # with self.json().
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
class ModelOutList(BaseModel):
|
| 224 |
+
__root__: list[ModelOut]
|
| 225 |
+
|
| 226 |
+
def __iter__(self):
|
| 227 |
+
return iter(self.__root__)
|
| 228 |
+
|
| 229 |
+
def __getitem__(self, item):
|
| 230 |
+
return self.__root__[item]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class TrainModelOut(BaseModel):
|
| 234 |
+
# To be defined later when we have a list of accepted formats
|
| 235 |
+
model: Optional[Any] = None
|
| 236 |
+
artifacts: Optional[Any] = None
|
base/ocr.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Custom types for dealing with the Google Vision API JSON output.
|
| 3 |
+
"""
|
| 4 |
+
from enum import IntEnum
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BreakType(IntEnum):
|
| 11 |
+
UNKNOWN = 0
|
| 12 |
+
SPACE = 1
|
| 13 |
+
SURE_SPACE = 2
|
| 14 |
+
EOL_SURE_SPACE = 3
|
| 15 |
+
LINE_BREAK = 4
|
| 16 |
+
HYPHEN = 5
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class BlockType(IntEnum):
|
| 20 |
+
UNKNOWN = 0
|
| 21 |
+
TEXT = 1
|
| 22 |
+
TABLE = 2
|
| 23 |
+
PICTURE = 3
|
| 24 |
+
RULER = 4
|
| 25 |
+
BARCODE = 5
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class DetectedBreak(BaseModel):
|
| 29 |
+
type: BreakType
|
| 30 |
+
is_prefix: Optional[bool]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class DetectedLanguage(BaseModel):
|
| 34 |
+
languageCode: str
|
| 35 |
+
confidence: float
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TextProperty(BaseModel):
|
| 39 |
+
detectedLanguages: list[DetectedLanguage]
|
| 40 |
+
detectedBreak: Optional[DetectedBreak]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class Symbol(BaseModel):
|
| 44 |
+
property: Optional[TextProperty]
|
| 45 |
+
boundingBox: Any
|
| 46 |
+
text: str
|
| 47 |
+
confidence: float
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class Word(BaseModel):
|
| 51 |
+
property: Optional[TextProperty]
|
| 52 |
+
boundingBox: Any
|
| 53 |
+
symbols: list[Symbol]
|
| 54 |
+
confidence: float
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class Paragraph(BaseModel):
|
| 58 |
+
property: Optional[TextProperty]
|
| 59 |
+
boundingBox: Any
|
| 60 |
+
words: list[Word]
|
| 61 |
+
confidence: float
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class Block(BaseModel):
|
| 65 |
+
property: Optional[TextProperty]
|
| 66 |
+
boundingBox: Any
|
| 67 |
+
paragraphs: list[Paragraph]
|
| 68 |
+
blockType: BlockType
|
| 69 |
+
confidence: float
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class Page(BaseModel):
|
| 73 |
+
property: Optional[TextProperty]
|
| 74 |
+
width: int
|
| 75 |
+
height: int
|
| 76 |
+
blocks: list[Block]
|
| 77 |
+
confidence: float
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class TextAnnotation(BaseModel):
|
| 81 |
+
pages: list[Page]
|
| 82 |
+
text: str
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class Output(BaseModel):
|
| 86 |
+
fullTextAnnotation: Optional[TextAnnotation] = None
|
data_models.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic models used throughout the codebase.
|
| 3 |
+
|
| 4 |
+
In particular, these are the types that are used as input and output of each step of the pipeline.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
from typing import Any, Optional, Union
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RedirectInput(BaseModel):
|
| 13 |
+
pipeline_arn: str
|
| 14 |
+
job_id: str
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class NutrientTableQuantity(BaseModel):
|
| 18 |
+
measurementUnitCode: str
|
| 19 |
+
value: str
|
| 20 |
+
precisionCode: str
|
| 21 |
+
|
| 22 |
+
def __str__(self):
|
| 23 |
+
return f"{self.precisionCode} {self.value} {self.measurementUnitCode}"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class NutrientTableDailyValueIntake(BaseModel):
|
| 27 |
+
value: str
|
| 28 |
+
precisionCode: str
|
| 29 |
+
|
| 30 |
+
def __str__(self):
|
| 31 |
+
return f"{self.precisionCode} {self.value}%"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def s(
|
| 35 |
+
text: Optional[Union[NutrientTableQuantity, NutrientTableDailyValueIntake, str]]
|
| 36 |
+
) -> str:
|
| 37 |
+
"""
|
| 38 |
+
Returns None as "?", otherwise argument as string.
|
| 39 |
+
"""
|
| 40 |
+
if text:
|
| 41 |
+
return str(text)
|
| 42 |
+
else:
|
| 43 |
+
return "?"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class NutrientTableElement(BaseModel):
|
| 47 |
+
coordinates: str
|
| 48 |
+
probability: float
|
| 49 |
+
nutrientTypeCode: Optional[str]
|
| 50 |
+
quantityContained: NutrientTableQuantity
|
| 51 |
+
dailyValueIntakePercent: Optional[NutrientTableDailyValueIntake]
|
| 52 |
+
precisionCode: str
|
| 53 |
+
|
| 54 |
+
def __str__(self):
|
| 55 |
+
return " ".join(
|
| 56 |
+
[
|
| 57 |
+
s(self.nutrientTypeCode),
|
| 58 |
+
s(self.quantityContained),
|
| 59 |
+
f"({s(self.dailyValueIntakePercent)})",
|
| 60 |
+
]
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class NutrientTable(BaseModel):
|
| 65 |
+
nutrientBasisQuantityValue: Optional[str]
|
| 66 |
+
nutrientBasisQuantityMeasurementUnitCode: Optional[str]
|
| 67 |
+
preperationStateCode: Optional[str]
|
| 68 |
+
values: list[NutrientTableElement]
|
| 69 |
+
|
| 70 |
+
def __str__(self):
|
| 71 |
+
top = "Nutrients per " + " ".join(
|
| 72 |
+
[
|
| 73 |
+
s(self.nutrientBasisQuantityValue),
|
| 74 |
+
s(self.nutrientBasisQuantityMeasurementUnitCode),
|
| 75 |
+
f"({s(self.preperationStateCode)})",
|
| 76 |
+
]
|
| 77 |
+
)
|
| 78 |
+
vals = "\n\t".join([str(v) for v in self.values])
|
| 79 |
+
return f"{top}\n\t{vals}"
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Attribute(BaseModel):
|
| 83 |
+
coordinates: str
|
| 84 |
+
entity: str
|
| 85 |
+
probability: float
|
| 86 |
+
value: Union[str, list[NutrientTable]]
|
| 87 |
+
model: str
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class AttributeCommunicationChannel(BaseModel):
|
| 91 |
+
coordinates: str
|
| 92 |
+
probability: float
|
| 93 |
+
model: str
|
| 94 |
+
entity: str
|
| 95 |
+
communicationChannelCode: str
|
| 96 |
+
communicationValue: str
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class AttributeAllergen(BaseModel):
|
| 100 |
+
coordinates: str
|
| 101 |
+
probability: float
|
| 102 |
+
model: str
|
| 103 |
+
entity: str
|
| 104 |
+
allergenTypeCode: str
|
| 105 |
+
levelOfContainmentCode: str
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class NetContentAttribute(BaseModel):
|
| 109 |
+
coordinates: str
|
| 110 |
+
probability: float
|
| 111 |
+
model: str
|
| 112 |
+
entity: str
|
| 113 |
+
measurementUnitCode: str
|
| 114 |
+
value: str
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class AllergensOut(BaseModel):
|
| 118 |
+
entity: str
|
| 119 |
+
values: list[AttributeAllergen]
|
| 120 |
+
model: str
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class CommunicationChannelsOut(BaseModel):
|
| 124 |
+
entity: str
|
| 125 |
+
values: list[AttributeCommunicationChannel]
|
| 126 |
+
model: str
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class PipelineInput(BaseModel):
|
| 130 |
+
image_key: str
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class PipelineOutput(BaseModel):
|
| 134 |
+
attributes: list[
|
| 135 |
+
Union[Attribute, CommunicationChannelsOut, AllergensOut, NetContentAttribute]
|
| 136 |
+
]
|
| 137 |
+
job_id: str = Field(alias="job-id")
|
| 138 |
+
text: str
|
| 139 |
+
|
| 140 |
+
class Config:
|
| 141 |
+
allow_population_by_field_name = True
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class TextWithLanguage(BaseModel):
|
| 145 |
+
text: str
|
| 146 |
+
lang_code: str
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class OCRTextOut(BaseModel):
|
| 150 |
+
blocks: list[str]
|
| 151 |
+
full_text: str
|
| 152 |
+
sentences: list[TextWithLanguage]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class OCRTableOut(BaseModel):
|
| 156 |
+
tables: list[list[list[str]]]
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class OCROut(BaseModel):
|
| 160 |
+
result: Union[OCRTextOut, OCRTableOut]
|
| 161 |
+
job_id: str
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class OCROutList(BaseModel):
|
| 165 |
+
__root__: list[OCROut]
|
| 166 |
+
|
| 167 |
+
def __iter__(self):
|
| 168 |
+
return iter(self.__root__)
|
| 169 |
+
|
| 170 |
+
def __getitem__(self, item):
|
| 171 |
+
return self.__root__[item]
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class OCRWrapperOut(BaseModel):
|
| 175 |
+
blocks: list[str]
|
| 176 |
+
full_text: str
|
| 177 |
+
job_id: str
|
| 178 |
+
sentences: list[TextWithLanguage]
|
| 179 |
+
tables: list[list[list[str]]]
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class ClassifiedText(BaseModel):
|
| 183 |
+
text: str
|
| 184 |
+
attribute: str
|
| 185 |
+
confidence: float
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class CommunicationChannels(BaseModel):
|
| 189 |
+
confidence: float
|
| 190 |
+
attribute: str
|
| 191 |
+
communicationChannelCode: str
|
| 192 |
+
communicationValue: str
|
| 193 |
+
text: Optional[str] = ""
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class Allergen(BaseModel):
|
| 197 |
+
confidence: float
|
| 198 |
+
attribute: str
|
| 199 |
+
allergenTypeCode: str
|
| 200 |
+
levelOfContainmentCode: str
|
| 201 |
+
text: Optional[str] = ""
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
class NetContent(BaseModel):
|
| 205 |
+
confidence: float
|
| 206 |
+
attribute: str
|
| 207 |
+
measurementUnitCode: str
|
| 208 |
+
value: str
|
| 209 |
+
text: Optional[str] = ""
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class ModelOut(BaseModel):
|
| 213 |
+
blocks: list[Union[NetContent, Allergen, CommunicationChannels, ClassifiedText]]
|
| 214 |
+
tables: Optional[list[NutrientTable]]
|
| 215 |
+
job_id: str
|
| 216 |
+
model: str
|
| 217 |
+
full_text: str
|
| 218 |
+
|
| 219 |
+
def toJSON(self):
|
| 220 |
+
return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
class ModelOutList(BaseModel):
|
| 224 |
+
__root__: list[ModelOut]
|
| 225 |
+
|
| 226 |
+
def __iter__(self):
|
| 227 |
+
return iter(self.__root__)
|
| 228 |
+
|
| 229 |
+
def __getitem__(self, item):
|
| 230 |
+
return self.__root__[item]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class TrainModelOut(BaseModel):
|
| 234 |
+
# To be defined later when we have a list of accepted formats
|
| 235 |
+
model: Optional[Any] = None
|
| 236 |
+
artifacts: Optional[Any] = None
|
train_classifiers.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
utils.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
from google.cloud import vision
|
| 4 |
+
from base import OCROut, OCRTextOut, TextWithLanguage, ocr
|
| 5 |
+
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer
|
| 6 |
+
from langdetect import detect
|
| 7 |
+
|
| 8 |
+
from langdetect.lang_detect_exception import LangDetectException
|
| 9 |
+
from typing import Any, Optional, Union
|
| 10 |
+
import os

# SECURITY(review): a complete GCP service-account private key was hard-coded
# here as a string literal (and therefore lives in version-control history).
# That credential must be considered compromised — revoke/rotate it in the
# GCP console. Credentials are now read from the environment instead of
# being embedded in source code.
gcp_credentials: str = os.environ["GCP_CREDENTIALS"]

creds = json.loads(gcp_credentials)
google_client = vision.ImageAnnotatorClient.from_service_account_info(creds)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def image_to_byte_array(image) -> bytes:
    """Serialize a PIL image to PNG-encoded bytes."""
    with io.BytesIO() as buffer:
        image.save(buffer, format="png")
        return buffer.getvalue()
|
| 21 |
+
|
| 22 |
+
def run_image_ocr(image_bytes: bytes, google_client) -> "tuple[ocr.Output, Any]":
    """OCR for a PNG image.

    Sends raw PNG bytes to the Google Vision text-detection endpoint using the
    provided client.

    Returns a tuple of (parsed ocr.Output model, raw response dict).
    NOTE(review): the original annotation claimed only ocr.Output, but the
    body has always returned both values.
    """
    # Send image to API
    google_response = google_client.text_detection(
        image=vision.Image(content=image_bytes)
    )

    # Convert the protobuf response into a plain dict so it can be validated
    # by the pydantic ocr.Output model.
    google_response_json = json.loads(
        vision.AnnotateImageResponse.to_json(google_response)
    )
    return ocr.Output(**google_response_json), google_response_json
|
| 36 |
+
|
| 37 |
+
def get_block_language(block: ocr.Block) -> str:
    """Returns most confident language in block, or "unk" if there is no language"""
    if not block.property:
        return "unk"
    try:
        # Pick the detected language with the highest confidence score.
        best = max(block.property.detectedLanguages, key=lambda lang: lang.confidence)
        return best.languageCode
    except Exception:
        # Best effort: empty or malformed language metadata falls back to "unk".
        return "unk"
|
| 46 |
+
|
| 47 |
+
def block2text(block: ocr.Block) -> TextWithLanguage:
    """
    Extract the text from a block.

    Walks paragraphs -> words -> symbols, concatenating symbol text and
    inserting a space wherever the OCR engine detected a break.
    """
    lang_code = get_block_language(block)
    pieces = []
    for paragraph in block.paragraphs:
        for word in paragraph.words:
            for symbol in word.symbols:
                pieces.append(symbol.text)
                prop = symbol.property
                if prop and prop.detectedBreak:
                    pieces.append(" ")

    return TextWithLanguage(text="".join(pieces).strip(), lang_code=lang_code)
|
| 62 |
+
|
| 63 |
+
# Takes a list of blocks, and parses the sentences in each block
def get_sentences(blocks: list[TextWithLanguage]) -> list[TextWithLanguage]:
    """
    Split the sentences of the blocks and return a list of sentences.

    Each sentence is language-detected individually; detection failures are
    tagged "unk".
    """
    # Abbreviations (English + Dutch) that must not end a sentence.
    params = PunktParameters()
    params.abbrev_types = {
        "dr",
        "vs",
        "mr",
        "mrs",
        "prof",
        "inc",
        "vit",
        "o.a.",
        "o.b.v.",
        "s.p.",
        "m.u.v.",
        "i.v.m.",
        "a.k.a.",
        "e.g.",
        "m.b.v.",
        "max.",
        "min.",
    }
    tokenizer = PunktSentenceTokenizer(params)

    result = []
    for block in blocks:
        for sentence in tokenizer.tokenize(block.text):
            try:
                lang_code = detect(str(sentence).lower())
                # Because of lack of context in a sentence
                # afrikaans is often recognized
                if lang_code == "af":
                    lang_code = "nl"
            except LangDetectException:
                lang_code = "unk"
            result.append(TextWithLanguage(text=str(sentence), lang_code=lang_code))

    return result
|
| 104 |
+
|
| 105 |
+
def run_ocr(image_bytes: bytes) -> "tuple[OCRTextOut, Any]":
    """Run Google Vision OCR on an image and post-process it into OCRTextOut.

    Returns (ocr_text_out, raw_response_json).
    NOTE(review): the original annotation said Union[OCROut, Any]; the body
    actually returns an OCRTextOut plus the raw response dict.
    """
    # API response is of the type AnnotateImageResponse, see
    # https://cloud.google.com/vision/docs/reference/rest/v1/AnnotateImageResponse
    # for more details.
    ocr_image_annotation, response_json = run_image_ocr(image_bytes, google_client)

    # We assume we will only process pictures of one page,
    # and no documents of more than one page. Hence, we
    # take the first page here.

    # Check if the fullTextAnnotations are filled
    if ocr_image_annotation.fullTextAnnotation:
        ocr_blocks = ocr_image_annotation.fullTextAnnotation.pages[0].blocks
        text = ocr_image_annotation.fullTextAnnotation.text

        blocks = [block2text(block) for block in ocr_blocks]
        block_texts = [block.text for block in blocks]
        sentences = get_sentences(blocks)
    else:
        # No text detected: emit empty placeholders so downstream consumers
        # always receive a well-formed OCRTextOut.
        block_texts = [""]
        text = ""
        sentences = [TextWithLanguage(text="", lang_code="")]
    ocr_text_out = OCRTextOut(blocks=block_texts, full_text=text, sentences=sentences)
    return ocr_text_out, response_json
|
woc-logo-black.1a4c4e90.svg
ADDED
|
|