Spaces:
Runtime error
Runtime error
Commit
·
e5890ec
1
Parent(s):
cb7b577
Initial commit
Browse files
app.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functions.extract_function as get
|
| 2 |
+
import functions.preprocessing_function as preprocess
|
| 3 |
+
import functions.modelling_function as modelling
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import math
|
| 8 |
+
import numpy as np
|
| 9 |
+
from rapidfuzz import process, fuzz, utils
|
| 10 |
+
from simpletransformers.classification import ClassificationModel
|
| 11 |
+
from transformers import pipeline
|
| 12 |
+
import gradio as gr
|
| 13 |
+
|
| 14 |
+
# Set the working directory to the folder that contains this file instead of a
# hard-coded absolute path: the previous '/Users/...' location only exists on
# the original author's machine and makes the app crash at import time
# everywhere else.
# NOTE(review): assumes app.py sits at the project root that the helper
# modules expect as the current directory - confirm.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
| 16 |
+
|
| 17 |
+
def is_nan(text):
    """Return True when *text* converts to a float and that float is NaN.

    Args:
        text: Any value. Float NaN and strings that ``float()`` parses as NaN
            (for example ``"nan"``) both count as NaN.

    Returns:
        bool: ``True`` only if ``float(text)`` succeeds and the result is NaN;
        ``False`` for every value that cannot be converted to a float.
    """
    try:
        value = float(text)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric string (e.g. "16-16-16").
        # TypeError: None, lists and other non-numeric objects.
        # OverflowError: integers too large for a float.
        # None of these can be NaN.
        return False
    return math.isnan(value)
|
| 26 |
+
|
| 27 |
+
# Function for preparing catalog
|
| 28 |
+
def prepare_catalog():
    """Assemble the combined catalog that user input is matched against.

    The internal product catalog and the registered-fertilizer data are loaded
    through the project's ``get`` helpers, cleaned and compared through the
    ``preprocess`` helpers, merged into one frame, de-duplicated, and finally
    given a ``Formula`` column extracted from ``Registered Product``.

    Returns:
        The de-duplicated frame returned by ``preprocess.combine_catalog`` with
        an added ``Formula`` column: the first ``N-N-N`` style match found in
        ``Registered Product``, or NaN when there is none.
    """
    formula_pattern = r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}'

    def first_formula(name):
        # First formula-like match in the name, NaN when nothing matches.
        matches = re.findall(formula_pattern, name)
        return matches[0] if matches else np.nan

    # Load both sources: internal catalog first, then the external registry.
    internal = get.internal_data('catalog')
    registered = get.registered_fertilizer_data()

    # Clean the internal catalog and build the "full" name columns
    # (SKU + lower-cased brand + lower-cased type).
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )
    brand_lower = internal['Brand'].str.lower()
    type_lower = internal['Type'].str.lower()
    internal['Product SKU Full Clean'] = (
        internal['Product SKU Clean'] + ' ' + brand_lower + ' ' + type_lower
    )
    internal['Product SKU Full'] = (
        internal['Product SKU'] + ' ' + brand_lower + ' ' + type_lower
    )

    # Fuzzy-compare the internal catalog against the registry.
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registered,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # Per the original author's note: keep only registered fertilizers that
    # are NOT already in the product catalog (rows scoring above 80 are the
    # filter passed to the helper).
    already_matched = internal['Max Similarity Score'] > 80
    registered = preprocess.slice_with_filter(
        registered,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=already_matched,
    )

    # Merge both name lists into one catalog and drop exact duplicates.
    merged = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registered['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    )
    merged = merged.drop_duplicates()

    # Extract the formula from the product name in a single pass.
    merged['Formula'] = merged['Registered Product'].apply(first_formula)
    return merged
|
| 50 |
+
|
| 51 |
+
# Your existing decision function
|
| 52 |
+
# Lazily created text-classification pipeline, shared across calls.
_FERTILIZER_CLASSIFIER = None


def _get_fertilizer_classifier():
    """Create the IndoBERT classification pipeline on first use and reuse it."""
    global _FERTILIZER_CLASSIFIER
    if _FERTILIZER_CLASSIFIER is None:
        # The access token is read from the HF_TOKEN environment variable;
        # credentials must never be committed in source code.
        _FERTILIZER_CLASSIFIER = pipeline(
            "text-classification",
            model="matthewfarant/indobert-fertilizer-classifier",
            token=os.environ.get("HF_TOKEN"),
        )
    return _FERTILIZER_CLASSIFIER


def decision(user_input, catalog, product_name_catalog):
    """Decide whether a free-text product is in the catalog, new, or not a fertilizer.

    The input is fuzzy-matched against ``catalog[product_name_catalog]`` and
    its formula against ``catalog['Formula']``. The best-scoring row drives
    the decision; when no row scores above 80 the text classifier is
    consulted instead.

    Args:
        user_input: Free-text product name entered by the user.
        catalog: DataFrame with at least the ``product_name_catalog`` column
            plus ``Formula``, ``Source`` and ``Registered Product``. It is not
            modified; scoring happens on a copy.
        product_name_catalog: Name of the column holding the product names to
            compare ``user_input`` against.

    Returns:
        str: A human-readable decision message.

    Raises:
        IndexError: If ``catalog`` has no rows.
    """
    # Extract the first formula-like token (e.g. "16-16-16") from the input.
    formula_matches = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
    user_input_formula = formula_matches[0] if formula_matches else np.nan

    # Score on a copy so the caller's DataFrame is left untouched.
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process)
    )
    # NOTE(review): token_set_ratio compares sets of unique tokens, so formulas
    # that merely share numbers (e.g. "16-16-16" vs "16-20-16") may also score
    # 100 - confirm this is the intended formula comparison.
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process)
    )

    # Best match first; on ties "Product Catalog" sorts before
    # "Registered Fertilizers".
    best = scored.sort_values(
        by=['Similarity Score', 'Formula Similarity', 'Source'],
        ascending=[False, False, True],
    ).iloc[0]

    name_score = best['Similarity Score']
    formula_score = best['Formula Similarity']
    source = best['Source']
    matched_name = best['Registered Product']

    if name_score > 80:
        # A catalog row without a formula cannot contradict the input formula.
        formula_ok = formula_score == 100 or is_nan(best['Formula'])
        if formula_ok and source == 'Product Catalog':
            return f"Product is Available in Catalog (SKU Registered as {matched_name})"
        if formula_ok and source == 'Registered Fertilizers':
            return f"Add as New Product (Registered in Kementan as {matched_name})"
        if formula_score < 100:
            return f"Add as New Product (Similar to {matched_name} in {source} but with different formula)"
        return "Product is not a Fertilizer"

    # No close catalog match (score <= 80, including exactly 80, which the
    # previous "> 80" / "< 80" pair skipped): ask the classifier. The model is
    # loaded only when this branch is reached and inference runs exactly once.
    prediction = _get_fertilizer_classifier()(user_input)[0]
    label = prediction['label']
    probability = prediction['score']
    percent = np.round(probability * 100, 2)
    if label == 'Fertilizer' and probability > 0.8:
        return f"Add as New Product ({percent}% probability of being a fertilizer)"
    return f"Product might not be a Fertilizer ({percent}% probability of being a {label})"
|
| 81 |
+
|
| 82 |
+
def app(input):
    """Handle one Gradio request: build the catalog and decide on *input*.

    Args:
        input: Free-text product name typed by the user.

    Returns:
        The message produced by ``decision`` when matching against the
        ``"Registered Product"`` column of a freshly prepared catalog.
    """
    # The catalog is rebuilt for every request before matching.
    return decision(input, prepare_catalog(), "Registered Product")
|
| 85 |
+
|
| 86 |
+
# Static UI content, kept apart from the Interface wiring below.
_EXAMPLE_INPUTS = ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh']

_ARTICLE_MARKDOWN = """


### About The App

This app is built as a part of the Data Science Weekend 2023 Challenge submission. This app aims to help fertilizer companies to map
free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
the catalog. <br>

### How Does it Work?

This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
When a product is not available in the catalog, we will use a IndoBERT model to determine if the product is a fertilizer and eligible to be
added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
model will be able to learn how fertilizer products (especially the local ones) look like. <br>

### What is the Flags For?

The flag is a part of the Active Transfer Learning feature of this app. When a user flags a product as "Correct" or "Incorrect", the developer
will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
flagging the prediction result! <br>

### I want to test multiple inputs at once!

You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
programmatically. <br>

"""

# Wire the text-in / text-out interface around the `app` handler.
demo = gr.Interface(
    fn=app,
    inputs="text",
    outputs="text",
    examples=_EXAMPLE_INPUTS,
    title='Fertilizer Catalog Engine 🌽',
    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
    article=_ARTICLE_MARKDOWN,
    api_name='search',
    flagging_options=["Correct", "Incorrect"],
    theme=gr.themes.Soft(),
)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_api=True)
|