# nicks
# [Added] new attributes to mapping and handled contactinfo differently
# 75f0880
import json
from zipfile import ZipFile
import gradio as gr
import pandas as pd
def create_output_attribute(attribute_name, attribute_value, attribute_language):
    """Build a single NER-style attribute record in the Certifai output format."""
    return {
        "coordinates": "None",
        "entity": attribute_name,
        "probability": 1.0,
        "value": attribute_value,
        "model": "NER",
        "language": attribute_language,
    }
def create_allergen_attribute(allergen_type_code, allergen_containment_level='None'):
    """Build a rule-based Allergen record; containment level defaults to 'None'."""
    return {
        "coordinates": "None",
        "probability": 1.0,
        "model": "rule-based",
        "entity": "Allergen",
        "allergenTypeCode": allergen_type_code,
        "levelOfContainmentCode": allergen_containment_level,
    }
def extract_attribute_value_from_df(df, attribute_id):
    """Return the first 'attribute_value' whose 'attribute_id' matches, or None."""
    matches = df.loc[df['attribute_id'] == attribute_id]
    if matches.empty:
        return None
    return matches['attribute_value'].iloc[0]
def extract_value_and_unit(value_in):
    """Split a 'value unit' string into (value, unit).

    Returns (None, None) for None input; if the string is not exactly two
    space-separated tokens, returns (original string, None).
    """
    if value_in is None:
        return None, None
    parts = value_in.split(' ')
    return (parts[0], parts[1]) if len(parts) == 2 else (value_in, None)
def process_normal_attributes(pxm_output, certifai_mapping, desired_language):
    """Split PXM attributes into mapped flat outputs, hierarchy rows and the GTIN.

    Returns (output_list, hierarchy_attributes, gtin); gtin is None when
    attribute 3603 is absent from the input.
    """
    output_list = []
    hierarchy_attributes = []
    gtin = None
    for attribute in pxm_output['data']:
        attribute_id = str(attribute['attribute_id'])
        language = attribute['locale']['value']
        value = str(attribute['value'])
        path = attribute['path']
        if path is not None:
            # Hierarchical attributes are handled later by the dataframe processors
            hierarchy_attributes.append({
                'attribute_id': attribute_id,
                'attribute_value': value,
                'attribute_language': language,
                'path': path,
            })
        elif attribute_id in certifai_mapping and language in desired_language:
            output_list.append(
                create_output_attribute(certifai_mapping[attribute_id], value, language))
        # Attribute 3603 carries the GTIN
        if attribute_id == '3603':
            gtin = value
    return output_list, hierarchy_attributes, gtin
def extract_path_hierarchy(hierarchy_attributes_df):
    """Split the dotted 'path' column into three level columns, in place.

    Adds 'path.0', 'path.1' and 'path.2' columns; missing levels are None.
    The original silently skipped paths with more than three components,
    yielding a list shorter than the dataframe and a length-mismatch error
    on assignment; such paths are now truncated to their first three levels.

    Returns the (mutated) dataframe for chaining.
    """
    path_list = []
    for parts in hierarchy_attributes_df['path'].str.split('.'):
        parts = list(parts)[:3]
        # Pad out to exactly three entries so every row has the same width
        parts += [None] * (3 - len(parts))
        path_list.append(parts)
    hierarchy_attributes_df[['path.0', 'path.1', 'path.2']] = path_list
    return hierarchy_attributes_df
def process_allergens(hierarchy_attributes_df):
    """Build the 'allergens' attribute from hierarchy rows.

    Attribute ids: 5184 = allergen type code, 5191 = level of containment.
    Rows are filtered to the first row's language, then grouped by path.
    The original indexed iloc[0] unconditionally, raising IndexError when a
    path group lacked either id; groups without a type code are now skipped
    and a missing containment level falls back to the helper's default.
    """
    allergens_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['5184', '5191'])]
    allergens_list = []
    if len(allergens_df) > 0:
        # Keep a single language (the first row's) to avoid duplicate translations
        allergens_df = allergens_df[
            allergens_df['attribute_language'] == allergens_df.iloc[0]['attribute_language']]
        for _path, group in allergens_df.groupby('path'):
            type_rows = group[group['attribute_id'] == '5184']
            if type_rows.empty:
                # No allergen type code for this path — skip instead of raising
                continue
            level_rows = group[group['attribute_id'] == '5191']
            level = level_rows.iloc[0]['attribute_value'] if not level_rows.empty else 'None'
            allergens_list.append(
                create_allergen_attribute(type_rows.iloc[0]['attribute_value'], level))
    return {'entity': 'allergens', 'values': allergens_list, 'model': 'rule-based'}
# DEPRECATED (kept for reference): earlier communication-channel mapping using
# attribute ids 2400/2401; superseded by the version below (ids 4896-4901).
# def process_communication_channels(hierarchy_attributes_df):
# # Communication Channels
# communication_channels_df = hierarchy_attributes_df[
# (hierarchy_attributes_df['attribute_id'].isin(['2400', '2401']))]
# communication_channels_df = communication_channels_df[
# communication_channels_df['attribute_language'] == communication_channels_df.iloc[0]['attribute_language']]
# communication_channels_list = []
# for path_1, group in communication_channels_df.groupby('path'):
# communication_channel_dict = {
# "coordinates": "None",
# "probability": 1.0,
# "model": "rule-based",
# "entity": "CommunicationChannel",
# "communicationChannelCode": extract_attribute_value_from_df(group, '2400'),
# "communicationValue": extract_attribute_value_from_df(group, '2401')
# }
# communication_channels_list.append(communication_channel_dict)
# communication_channel_attribute = {'entity': 'communicationChannels', 'values': communication_channels_list}
# return communication_channel_attribute
def process_communication_channels(hierarchy_attributes_df):
    """Collect CommunicationChannel records (code 4900, value 4901) per path.

    Rows are restricted to the first row's language; groups missing either
    the channel code or the value are dropped. Each record keeps its
    top-level path component ('path.0') so callers can join it to contacts.
    Raises IndexError on an empty selection (callers catch this).
    """
    channels_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['4896', '4897', "4898", "4900", "4901"])]
    first_language = channels_df.iloc[0]['attribute_language']
    channels_df = channels_df[channels_df['attribute_language'] == first_language]
    channels = []
    for _path, group in channels_df.groupby('path'):
        code = extract_attribute_value_from_df(group, '4900')
        value = extract_attribute_value_from_df(group, '4901')
        if not (code and value):
            continue
        channels.append({
            "path": group["path.0"].iloc[0],
            "coordinates": "None",
            "probability": 1.0,
            "model": "rule-based",
            "entity": "CommunicationChannel",
            "communicationChannelCode": code,
            "communicationValue": value,
        })
    return channels
def process_contact_information(hierarchy_attributes_df, communication_channels_list):
    """Assemble contact information entries (ids 4896/4897/4898).

    Communication channels are attached by matching their 'path' field to
    the contact's enumeration index (as a string) — assumes the channel
    list was produced by process_communication_channels over the same data.
    Raises IndexError on an empty selection (callers catch this).
    """
    contacts_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['4896', '4897', "4898"])]
    contacts_df = contacts_df[
        contacts_df['attribute_language'] == contacts_df.iloc[0]['attribute_language']]
    contacts = []
    for index, (_path, group) in enumerate(contacts_df.groupby('path')):
        name = extract_attribute_value_from_df(group, '4896')
        address = extract_attribute_value_from_df(group, '4897')
        type_code = extract_attribute_value_from_df(group, '4898')
        if name and address and type_code:
            matching = [c for c in communication_channels_list if c["path"] == str(index)]
            contacts.append({
                "contactInformation": str(index),
                "contactName": name,
                "contact_address": address,
                "contactTypeCode": type_code,
                "communicationChannels": matching,
            })
    return {'entity': 'contact_information', 'values': contacts}
def process_preparation_instructions(hierarchy_attributes_df):
    """Build the 'preparationInstructions' attribute (ids 5206/5207).

    Rows are restricted to the first row's language and grouped by path.
    Raises IndexError on an empty selection (callers catch this).
    """
    prep_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['5206', '5207'])]
    prep_df = prep_df[prep_df['attribute_language'] == prep_df.iloc[0]['attribute_language']]
    values = [
        {
            "coordinates": "None",
            "probability": 1.0,
            "model": "rule-based",
            "entity": "PreparationInstruction",
            "preparationTypeCode": extract_attribute_value_from_df(group, '5206'),
            "preparationInstructions": extract_attribute_value_from_df(group, '5207')
        }
        for _path, group in prep_df.groupby('path')
    ]
    return {'entity': 'preparationInstructions', 'values': values}
def process_diet_information(hierarchy_attributes_df):
    """Build the 'dietInformation' attribute (ids 5203/5204).

    Rows are restricted to the first row's language and grouped by path.
    Raises IndexError on an empty selection (callers catch this).
    """
    diet_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['5203', '5204'])]
    diet_df = diet_df[diet_df['attribute_language'] == diet_df.iloc[0]['attribute_language']]
    values = [
        {
            "coordinates": "None",
            "probability": 1.0,
            "model": "rule-based",
            "entity": "DietInformation",
            "dietTypeCode": extract_attribute_value_from_df(group, '5203'),
            "isDietTypeMarkedOnPackage": extract_attribute_value_from_df(group, '5204')
        }
        for _path, group in diet_df.groupby('path')
    ]
    return {'entity': 'dietInformation', 'values': values}
def process_claim_element_information(hierarchy_attributes_df):
    """Build the 'claimElementInformation' attribute (ids 5199/5200/5201).

    Rows are restricted to the first row's language and grouped by path.
    Raises IndexError on an empty selection (callers catch this).
    """
    claims_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['5199', '5200', '5201'])]
    claims_df = claims_df[claims_df['attribute_language'] == claims_df.iloc[0]['attribute_language']]
    values = [
        {
            "coordinates": "None",
            "probability": 1.0,
            "model": "rule-based",
            "entity": "ClaimElementInformation",
            "claimElementCode": extract_attribute_value_from_df(group, '5199'),
            "claimTypeCode": extract_attribute_value_from_df(group, '5200'),
            "claimMarkedOnPackage": extract_attribute_value_from_df(group, '5201'),
        }
        for _path, group in claims_df.groupby('path')
    ]
    return {'entity': 'claimElementInformation', 'values': values}
def process_nutrient_table(hierarchy_attributes_df):
    """Build the nested 'nutrients' attribute (basis quantity -> nutrient values).

    Attribute ids: 5211 basis quantity ('value unit' string), 5212 preparation
    state, 5215 nutrient type, 5216 precision code, 5217 daily intake percent,
    5219 contained quantity ('value unit' string).

    Groups by the top-level path ('path.0') into basis-quantity blocks, each
    holding the nutrient rows of its sub-paths ('path.1'). Removed the unused
    'preferred_language' local from the original; rows are filtered to the
    first row's language.
    """
    nutrient_table_df = hierarchy_attributes_df[
        hierarchy_attributes_df['attribute_id'].isin(['5211', '5212', '5219', '5215', '5216', '5217'])]
    nutrient_table_list = []
    if len(nutrient_table_df) > 0:
        nutrient_table_df = nutrient_table_df[
            nutrient_table_df['attribute_language'] == nutrient_table_df.iloc[0]['attribute_language']]
        for path_0, group in nutrient_table_df.groupby('path.0'):
            basis_value, basis_unit = extract_value_and_unit(
                extract_attribute_value_from_df(group, '5211'))
            nutrient_basis_quantity_dict = {
                "nutrientBasisQuantityValue": basis_value,
                "nutrientBasisQuantityMeasurementUnitCode": basis_unit,
                "preparationStateCode": extract_attribute_value_from_df(group, '5212')}
            nutrient_values_list = []
            for path_1, sub_group in group.sort_values(by='path.1').groupby('path.1'):
                value, unit = extract_value_and_unit(
                    extract_attribute_value_from_df(sub_group, '5219'))
                nutrient_values_list.append({
                    "coordinates": "",
                    "probability": 1.0,
                    "nutrientTypeCode": extract_attribute_value_from_df(sub_group, '5215'),
                    "quantityContained": {
                        "measurementUnitCode": unit,
                        "value": value,
                        "precisionCode": extract_attribute_value_from_df(sub_group, '5216')
                    },
                    "dailyValueIntakePercent": {
                        'value': extract_attribute_value_from_df(sub_group, '5217'),
                        "precisionCode": "APPROXIMATELY"
                    }
                })
            nutrient_basis_quantity_dict['values'] = nutrient_values_list
            nutrient_table_list.append(nutrient_basis_quantity_dict)
    return {"coordinates": "None",
            "entity": "nutrients",
            "probability": 1.0,
            "value": nutrient_table_list,
            "model": "table-rule-based"}
def pad_gtin(gtin, desired_length=14):
    """Left-pad a GTIN string with zeros to desired_length (GTIN-14 by default).

    Strings already at or beyond desired_length are returned unchanged.
    Replaces the original's manual prepend loop with str.zfill; gtin is
    expected to be a digit string.
    """
    return gtin.zfill(desired_length)
# Load in the attribute name mappings
# certifai_mapping maps attribute_id (str) -> Certifai entity name; it is read
# at import time and used as a module-level global by process_file.
with open("field_mapping.json") as f:
    certifai_mapping = json.load(f)
def process_file(filename, pxm_output):
    """Map one PXM JSON export to the Certifai output format.

    Parameters
    ----------
    filename : str
        Source file name, expected to look like '<something>_<language>.json';
        the language token selects an extra extraction language.
    pxm_output : dict
        Parsed JSON with a 'data' list of attribute records.

    Returns
    -------
    tuple
        (gtin, output_dict): the zero-padded GTIN (None when attribute 3603
        is absent) and the Certifai output dictionary.
    """
    desired_language = ["en-GB"]
    # Get the language to extract from the file name.
    # NOTE: the original used filename.strip(".json"), which strips the
    # *characters* '.', 'j', 's', 'o', 'n' from both ends (corrupting names
    # such as 'notes_son.json'); removesuffix drops only the extension.
    language = filename.removesuffix(".json").split("_")[1]
    if language not in desired_language:
        desired_language.append(language)
    # Process the 'simple' (non-hierarchical) attributes
    output_list, hierarchy_attributes, gtin = process_normal_attributes(
        pxm_output, certifai_mapping, desired_language
    )
    # Process the complex (hierarchical) attributes
    hierarchy_attributes_df = pd.DataFrame(hierarchy_attributes)
    if len(hierarchy_attributes_df) > 0:
        hierarchy_attributes_df = extract_path_hierarchy(hierarchy_attributes_df)
        # Allergens
        output_list.append(process_allergens(hierarchy_attributes_df))
        # Communication channels (consumed below by the contact information step);
        # each step is best-effort: a failure is logged and skipped.
        try:
            communication_channels_list = process_communication_channels(hierarchy_attributes_df)
        except Exception as e:
            print(e)
            communication_channels_list = []
        # Contact information
        try:
            output_list.append(
                process_contact_information(hierarchy_attributes_df, communication_channels_list))
        except Exception as e:
            print(e)
        # Preparation instructions
        try:
            output_list.append(process_preparation_instructions(hierarchy_attributes_df))
        except Exception as e:
            print(e)
        # Diet information
        try:
            output_list.append(process_diet_information(hierarchy_attributes_df))
        except Exception as e:
            print(e)
        # Claim element information
        try:
            output_list.append(process_claim_element_information(hierarchy_attributes_df))
        except Exception as e:
            print(e)
        # Nutrient table
        output_list.append(process_nutrient_table(hierarchy_attributes_df))
    output_dict = {"attributes": output_list, "text": "OCR Output:"}
    # Guard: the original crashed in pad_gtin when no GTIN (id 3603) was found.
    if gtin is not None:
        gtin = pad_gtin(gtin)
    return gtin, output_dict
def create_demo_data(files):
    """Gradio callback: convert uploaded PXM JSON files and zip the results.

    Parameters
    ----------
    files : list
        Gradio file objects exposing .orig_name (path of the uploaded file).

    Returns
    -------
    str
        Path of the created zip archive ('demo_data.zip').

    Each input is converted via process_file, written to '<gtin>.json' in the
    working directory, and all outputs are bundled into one archive.
    """
    gtins = []
    for file in files:
        # The original used json.load(open(...)), leaking the file handle;
        # a context manager closes it deterministically.
        with open(file.orig_name, "r") as in_file:
            pxm_output = json.load(in_file)
        gtin, output_dict = process_file(filename=file.orig_name, pxm_output=pxm_output)
        with open(f"{gtin}.json", "w") as my_file:
            json.dump(output_dict, my_file)
        gtins.append(gtin)
    with ZipFile("demo_data.zip", "w") as zip_object:
        for gtin in gtins:
            zip_object.write(f"{gtin}.json")
    return "demo_data.zip"
# Gradio UI entry point: multi-file JSON upload in, zip archive out.
# NOTE(review): `output` is created but never wired into the interface
# (outputs="file" below) — presumably leftover; confirm before removing.
output = gr.Textbox(label="Result")
gr.Interface(
    fn=create_demo_data,
    inputs=gr.File(file_count="multiple", file_types=[".json"]),
    outputs="file",
    title="Demo data creator",
).launch()