Spaces:

matthewfarant
/

fertilizer-catalog-engine

Runtime error

App Files Files Community

matthewfarant committed on Nov 3, 2023

Commit

e5890ec

1 Parent(s): cb7b577

Initial commit

Browse files

Files changed (1) hide show

app.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import functions.extract_function as get
+import functions.preprocessing_function as preprocess
+import functions.modelling_function as modelling
+import os
+import re
+import math
+import numpy as np
+from rapidfuzz import process, fuzz, utils
+from simpletransformers.classification import ClassificationModel
+from transformers import pipeline
+import gradio as gr
+# Set the working directory.
+# The default is the original developer-local path; it can be overridden with
+# the DSW_SOLUTION_DIR environment variable. The chdir is skipped when the
+# directory does not exist, so the module no longer raises FileNotFoundError
+# at import time on machines that lack that path.
+_WORKING_DIR = os.environ.get('DSW_SOLUTION_DIR', '/Users/matthew.farant/dsw_solution')
+if os.path.isdir(_WORKING_DIR):
+    os.chdir(_WORKING_DIR)
+def is_nan(text):
+    """Return True when ``text`` converts to a float that is NaN.
+
+    Args:
+        text: Any value. Strings such as ``'nan'`` and a float NaN both
+            count as NaN; anything that cannot be converted to a float
+            does not.
+
+    Returns:
+        bool: True only if ``float(text)`` succeeds and the result is NaN.
+    """
+    try:
+        # Attempt to convert the value to a float
+        value = float(text)
+    except (TypeError, ValueError, OverflowError):
+        # ValueError: non-numeric string (e.g. '16-16-16').
+        # TypeError: non-convertible object (e.g. None or a list).
+        # OverflowError: integer too large for a float.
+        # None of these is a NaN.
+        return False
+    return math.isnan(value)
+# Build the combined catalog that user input is matched against
+def prepare_catalog():
+    """Assemble the combined catalog and attach an extracted ``Formula`` column.
+
+    Loads the internal catalog and the registered-fertilizer data through the
+    ``get`` helpers, cleans and fuzzy-compares them through the ``preprocess``
+    helpers, combines both sources, drops duplicate rows and finally extracts
+    the first three-number pattern (e.g. ``16-16-16``) found in each
+    ``Registered Product`` value.
+
+    Returns:
+        The combined catalog returned by ``preprocess.combine_catalog`` after
+        ``drop_duplicates()``, with an added ``Formula`` column holding the
+        first matched pattern or NaN when nothing matched.
+    """
+    # Three groups of 1-2 digits separated by a dash or whitespace
+    formula_pattern = re.compile(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}')
+    # Internal catalog first, then the external (registered) one
+    internal = get.internal_data('catalog')
+    registered = get.registered_fertilizer_data()
+    # Clean the SKU column of the internal catalog
+    # NOTE(review): presumably this is what creates 'Product SKU Clean' - confirm in preprocess
+    internal = preprocess.clean_dataframe(
+        internal,
+        'Product SKU',
+        remove_na=False,
+        remove_non_words=True,
+        remove_symbols=True,
+    )
+    # Lower-cased brand and type are appended to both the cleaned and the raw SKU
+    brand_and_type = ' ' + internal['Brand'].str.lower() + ' ' + internal['Type'].str.lower()
+    internal['Product SKU Full Clean'] = internal['Product SKU Clean'] + brand_and_type
+    internal['Product SKU Full'] = internal['Product SKU'] + brand_and_type
+    # Compare the internal catalog against the registered fertilizers
+    internal = preprocess.fuzzy_join_compare(
+        internal,
+        'Product SKU Clean',
+        'Product SKU Full Clean',
+        registered,
+        take_regist_number=True,
+        set_ratio_weight=1,
+        ratio_weight=0,
+    )
+    # Slice the registered fertilizers using internal rows scoring above 80
+    # (per the original comment: keep only those NOT already in the product catalog)
+    high_similarity = internal['Max Similarity Score'] > 80
+    registered = preprocess.slice_with_filter(
+        registered,
+        'Nomor Pendaftaran',
+        internal,
+        use_filter=True,
+        filter_condition=high_similarity,
+    )
+    # Stack both sources into one catalog and drop exact duplicate rows
+    combined = preprocess.combine_catalog(
+        internal['Product SKU Full'],
+        registered['Nama Lengkap'],
+        'Product Catalog',
+        'Registered Fertilizers',
+    )
+    combined = combined.drop_duplicates()
+    # Find every formula-like pattern per product name, then keep the first
+    # one (NaN when the list of matches is empty)
+    found_formulas = combined['Registered Product'].apply(lambda name: formula_pattern.findall(name))
+    combined['Formula'] = found_formulas.apply(lambda found: found[0] if len(found) != 0 else np.nan)
+    return combined
+# Text-classification pipeline, created on first use and then reused
+_CLASSIFIER = None
+def _get_classifier():
+    """Return the fertilizer classifier pipeline, building it on first use."""
+    global _CLASSIFIER
+    if _CLASSIFIER is None:
+        # The access token is read from the HF_TOKEN environment variable
+        # instead of being hard-coded in source. The previously committed
+        # token should be treated as leaked and revoked.
+        _CLASSIFIER = pipeline(
+            "text-classification",
+            model="matthewfarant/indobert-fertilizer-classifier",
+            token=os.environ.get('HF_TOKEN'),
+        )
+    return _CLASSIFIER
+def _classify_unmatched(user_input):
+    """Build the message for an input that has no close catalog match."""
+    # Run the model once and reuse the prediction
+    prediction = _get_classifier()(user_input)[0]
+    label = prediction['label']
+    percent = np.round(prediction['score'] * 100, 2)
+    if label == 'Fertilizer' and prediction['score'] > 0.8:
+        return f"Add as New Product ({percent}% probability of being a fertilizer)"
+    return f"Product might not be a Fertilizer ({percent}% probability of being a {label})"
+# Decision function: map a free-text product name to a catalog action
+def decision(user_input, catalog, product_name_catalog):
+    """Decide whether ``user_input`` is in the catalog, new, or not a fertilizer.
+
+    The input is fuzzy-matched against ``catalog[product_name_catalog]`` and its
+    formula (first three-number pattern such as ``16-16-16``) is compared with
+    the ``Formula`` column. The best-scoring row drives the decision; when no
+    row scores above 80 the text classifier is consulted instead.
+
+    Args:
+        user_input: Free-text product name.
+        catalog: DataFrame with at least the ``product_name_catalog`` column
+            plus ``Formula``, ``Source`` and ``Registered Product``. It is
+            not modified; scoring happens on a copy.
+        product_name_catalog: Name of the column to fuzzy-match against.
+
+    Returns:
+        str: A human-readable decision message.
+    """
+    # Extract formula from the user input (NaN when none is present)
+    found = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
+    user_input_formula = found[0] if found else np.nan
+    # Score on a copy so the caller's DataFrame does not gain extra columns
+    scored = catalog.copy()
+    scored['Similarity Score'] = scored[product_name_catalog].apply(lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process))
+    scored['Formula Similarity'] = scored['Formula'].apply(lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process))
+    # Take the top similar product; "Product Catalog" sorts before
+    # "Registered Fertilizers" on ties
+    best = scored.sort_values(by=['Similarity Score', 'Formula Similarity', 'Source'], ascending=[False, False, True]).head(1)
+    if best.empty:
+        # Nothing to match against: rely on the classifier alone
+        return _classify_unmatched(user_input)
+    similarity = best['Similarity Score'].values[0]
+    formula_similarity = best['Formula Similarity'].values[0]
+    source = best['Source'].values[0]
+    registered_product = best['Registered Product'].values[0]
+    if similarity > 80:
+        # A catalog entry without a formula cannot contradict the input's formula
+        formula_matches = formula_similarity == 100 or is_nan(best['Formula'].values[0])
+        if formula_matches and source == 'Product Catalog':
+            return f"Product is Available in Catalog (SKU Registered as {registered_product})"
+        if formula_matches and source == 'Registered Fertilizers':
+            return f"Add as New Product (Registered in Kementan as {registered_product})"
+        if formula_similarity < 100:
+            return f"Add as New Product (Similar to {registered_product} in {source} but with different formula)"
+        return "Product is not a Fertilizer"
+    # A score of 80 or below is not a match. Exactly 80 used to skip the
+    # classifier and fall straight through to "not a Fertilizer".
+    return _classify_unmatched(user_input)
+def app(input):
+    """Gradio handler: build the catalog, then return the decision for ``input``.
+
+    Args:
+        input: Free-text product name typed by the user.
+
+    Returns:
+        The message produced by ``decision`` when matching against the
+        ``Registered Product`` column of a freshly prepared catalog.
+    """
+    # The catalog is rebuilt on every call before the lookup runs
+    return decision(input, prepare_catalog(), "Registered Product")
+# Sample inputs offered in the UI
+_EXAMPLES = ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh']
+# Markdown passed as the interface's `article`
+_ARTICLE = """
+    ### About The App
+    This app is built as a part of the Data Science Weekend 2023 Challenge submission. This app aims to help fertilizer companies to map
+    free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
+    decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
+    the catalog. <br>
+    ### How Does it Work?
+    This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
+    When a product is not available in the catalog, we will use a IndoBERT model to determine if the product is a fertilizer and eligible to be
+    added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
+    model will be able to learn how fertilizer products (especially the local ones) look like. <br>
+    ### What is the Flags For?
+    The flag is a part of the Active Transfer Learning feature of this app. When a user flags a product as "Correct" or "Incorrect", the developer
+    will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
+    flagging the prediction result! <br>
+    ### I want to test multiple inputs at once!
+    You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
+    programmatically. <br>
+    """
+# Build the Gradio interface: one text box in, one text box out, served by `app`
+demo = gr.Interface(
+    fn=app,
+    inputs="text",
+    outputs="text",
+    title='Fertilizer Catalog Engine 🌽',
+    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
+    article=_ARTICLE,
+    examples=_EXAMPLES,
+    # Choices offered when a user flags a prediction
+    flagging_options=["Correct", "Incorrect"],
+    api_name='search',
+    theme=gr.themes.Soft(),
+)
+# Launch only when executed as a script
+if __name__ == "__main__":
+    demo.launch(show_api=True)