matthewfarant committed on
Commit
e5890ec
·
1 Parent(s): cb7b577

Initial commit

Browse files
Files changed (1) hide show
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functions.extract_function as get
2
+ import functions.preprocessing_function as preprocess
3
+ import functions.modelling_function as modelling
4
+
5
+ import os
6
+ import re
7
+ import math
8
+ import numpy as np
9
+ from rapidfuzz import process, fuzz, utils
10
+ from simpletransformers.classification import ClassificationModel
11
+ from transformers import pipeline
12
+ import gradio as gr
13
+
14
# Set the working directory to the project folder.
# The location can be overridden with the DSW_SOLUTION_DIR environment
# variable; the original developer path is kept as the default. The chdir is
# skipped when the directory does not exist, so importing this module on
# another machine no longer fails with FileNotFoundError.
_PROJECT_DIR = os.environ.get('DSW_SOLUTION_DIR', '/Users/matthew.farant/dsw_solution')
if os.path.isdir(_PROJECT_DIR):
    os.chdir(_PROJECT_DIR)
16
+
17
def is_nan(text):
    """Return True if *text* is, or parses to, a floating-point NaN.

    Args:
        text: Any value. It is converted with ``float()``, so the string
            ``"nan"`` counts as NaN as well as an actual NaN float.

    Returns:
        bool: True when ``float(text)`` succeeds and the result is NaN;
        False otherwise, including for values that cannot be converted.
    """
    try:
        value = float(text)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric string (e.g. "16-16-16").
        # TypeError: None, lists and other non-convertible objects.
        # OverflowError: integers too large for a float.
        # None of these can be NaN.
        return False
    return math.isnan(value)
26
+
27
def prepare_catalog():
    """Build the combined catalog that user input is matched against.

    Steps, in order:
      1. Load the internal catalog and the registered-fertilizer data via
         the project ``get`` module.
      2. Clean the internal catalog with ``preprocess.clean_dataframe`` and
         derive the two "full" SKU columns (SKU + lower-cased Brand + Type).
      3. Compare it against the registered fertilizers with
         ``preprocess.fuzzy_join_compare``.
      4. Slice the registered fertilizers with ``preprocess.slice_with_filter``
         using the rows whose ``Max Similarity Score`` exceeds 80 as filter.
      5. Combine both name lists, drop duplicate rows and add a ``Formula``
         column.

    Returns:
        The combined catalog (presumably a pandas DataFrame -- confirm against
        ``preprocess.combine_catalog``) with a ``Formula`` column containing
        the first ``N-N-N`` style match found in ``Registered Product``, or
        NaN when there is none.
    """
    formula_regex = re.compile(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}')

    def first_formula(product_name):
        # First formula-looking substring of the name, or NaN if none exists.
        matches = formula_regex.findall(product_name)
        return np.nan if len(matches) == 0 else matches[0]

    # Load internal and external sources.
    internal = get.internal_data('catalog')
    registered = get.registered_fertilizer_data()

    # Clean the internal catalog and build the enriched SKU names.
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )
    internal['Product SKU Full Clean'] = (
        internal['Product SKU Clean']
        + ' ' + internal['Brand'].str.lower()
        + ' ' + internal['Type'].str.lower()
    )
    internal['Product SKU Full'] = (
        internal['Product SKU']
        + ' ' + internal['Brand'].str.lower()
        + ' ' + internal['Type'].str.lower()
    )

    # Fuzzy-compare the internal catalog with the registered fertilizers.
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registered,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # Slice the registered fertilizers by registration number, filtering on
    # catalog rows whose best similarity score is above 80.
    high_similarity = internal['Max Similarity Score'] > 80
    registered = preprocess.slice_with_filter(
        registered,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=high_similarity,
    )

    # Merge both sources into one catalog and remove duplicate rows.
    merged = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registered['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    )
    merged = merged.drop_duplicates()

    # Extract the formula from the product name.
    merged['Formula'] = merged['Registered Product'].apply(first_formula)
    return merged
50
+
51
# Thresholds used by the decision rules.
_SIMILARITY_THRESHOLD = 80      # fuzzy name score above which a catalog row counts as a match
_CLASSIFIER_THRESHOLD = 0.8     # minimum classifier confidence to accept "Fertilizer"

# Lazily created text-classification pipeline (see _get_classifier).
_CLASSIFIER = None


def _get_classifier():
    """Return the fertilizer classifier, loading the model on first use only."""
    global _CLASSIFIER
    if _CLASSIFIER is None:
        _CLASSIFIER = pipeline(
            "text-classification",
            model="matthewfarant/indobert-fertilizer-classifier",
            # Credentials must not live in source control: the access token is
            # read from the HF_TOKEN environment variable (None when unset).
            token=os.environ.get("HF_TOKEN"),
        )
    return _CLASSIFIER


def _classify_product(user_input):
    """Run the classifier once on *user_input* and format the verdict."""
    prediction = _get_classifier()(user_input)[0]
    label = prediction['label']
    score = prediction['score']
    percent = np.round(score * 100, 2)
    if label == 'Fertilizer' and score > _CLASSIFIER_THRESHOLD:
        return f"Add as New Product ({percent}% probability of being a fertilizer)"
    return f"Product might not be a Fertilizer ({percent}% probability of being a {label})"


def decision(user_input, catalog, product_name_catalog):
    """Decide how a free-text product name relates to the catalog.

    The input is fuzzy-matched (``fuzz.token_set_ratio``) against the
    ``product_name_catalog`` column, and its formula (first ``N-N-N`` style
    substring, if any) against the ``Formula`` column. The best row is chosen
    by name score, then formula score, then ``Source`` ascending, so that
    "Product Catalog" wins ties against "Registered Fertilizers".

    Args:
        user_input: Free-text product name (str).
        catalog: Table with at least the columns ``product_name_catalog``,
            ``Formula``, ``Source`` and ``Registered Product``. It is not
            modified; scoring happens on a copy.
        product_name_catalog: Name of the column holding the product names
            to match against.

    Returns:
        str: A human-readable verdict. If the best name score is at most 80
        (or the catalog is empty) the verdict comes from the text classifier.
    """
    # Extract the formula from the user input (NaN when there is none).
    found = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
    user_input_formula = found[0] if found else np.nan

    # Score every catalog row; assign() returns a new frame, so the caller's
    # catalog does not gain the two scoring columns.
    scored = catalog.assign(**{
        'Similarity Score': catalog[product_name_catalog].apply(
            lambda name: fuzz.token_set_ratio(user_input, name, processor=utils.default_process)
        ),
        'Formula Similarity': catalog['Formula'].apply(
            lambda formula: fuzz.token_set_ratio(user_input_formula, formula, processor=utils.default_process)
        ),
    })

    # Nothing to compare against: fall back to the classifier instead of
    # failing with an IndexError on the empty result.
    if scored.empty:
        return _classify_product(user_input)

    # Take the top similar product, preferring "Product Catalog" on ties.
    best = scored.sort_values(
        by=['Similarity Score', 'Formula Similarity', 'Source'],
        ascending=[False, False, True],
    ).iloc[0]

    name_score = best['Similarity Score']
    formula_score = best['Formula Similarity']
    source = best['Source']
    matched_name = best['Registered Product']

    name_matches = name_score > _SIMILARITY_THRESHOLD
    # The formula is acceptable when it matches exactly, or when the catalog
    # entry has no formula to compare with.
    formula_ok = formula_score == 100 or is_nan(best['Formula'])

    if name_matches and source == 'Product Catalog' and formula_ok:
        return f"Product is Available in Catalog (SKU Registered as {matched_name})"
    if name_matches and source == 'Registered Fertilizers' and formula_ok:
        return f"Add as New Product (Registered in Kementan as {matched_name})"
    if name_matches and formula_score < 100:
        return f"Add as New Product (Similar to {matched_name} in {source} but with different formula)"
    if not name_matches:
        # Includes a score of exactly 80, which previously matched neither
        # "> 80" nor "< 80" and skipped the classifier altogether.
        return _classify_product(user_input)
    # Strong name match with an exact formula but an unrecognised Source value.
    return "Product is not a Fertilizer"
81
+
82
def app(input):
    """Gradio callback: classify one free-text product name.

    The catalog is rebuilt with ``prepare_catalog()`` on every call and the
    input is matched against its "Registered Product" column.

    Args:
        input: Product name typed by the user.

    Returns:
        The verdict string produced by ``decision``.
    """
    return decision(input, prepare_catalog(), "Registered Product")
85
+
86
# Initialize the app: a Gradio Interface that sends one text input to `app`
# and shows its returned text.
demo = gr.Interface(
    fn=app,
    inputs="text",
    outputs="text",
    # Sample inputs offered in the UI.
    examples= ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh'],
    title = 'Fertilizer Catalog Engine 🌽',
    description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
    # Long-form Markdown/HTML text passed as the `article` argument.
    article= """


### About The App

This app is built as a part of the Data Science Weekend 2023 Challenge submission. This app aims to help fertilizer companies to map
free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
the catalog. <br>

### How Does it Work?

This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
When a product is not available in the catalog, we will use a IndoBERT model to determine if the product is a fertilizer and eligible to be
added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
model will be able to learn how fertilizer products (especially the local ones) look like. <br>

### What is the Flags For?

The flag is a part of the Active Transfer Learning feature of this app. When a user flags a product as "Correct" or "Incorrect", the developer
will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
flagging the prediction result! <br>

### I want to test multiple inputs at once!

You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
programmatically. <br>

""",
    api_name='search',
    # Labels a user can choose from when flagging a prediction.
    flagging_options=["Correct","Incorrect"],
    theme = gr.themes.Soft()
)
127
+
128
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch(show_api=True)