Spaces:
Runtime error
Runtime error
| import functions.extract_function as get | |
| import functions.preprocessing_function as preprocess | |
| import functions.modelling_function as modelling | |
| import os | |
| import re | |
| import math | |
| import numpy as np | |
| from rapidfuzz import process, fuzz, utils | |
| from simpletransformers.classification import ClassificationModel | |
| from transformers import pipeline | |
| import gradio as gr | |
# Switch the working directory to the folder that contains this script, so
# relative paths (such as the 'flagging/' directory used further down) are
# resolved next to the file rather than wherever the process was started.
os.chdir(os.path.split(os.path.abspath(__file__))[0])
def is_nan(text):
    """Return True when *text* converts to a float NaN, otherwise False.

    Args:
        text: Any value. Floats and numeric strings are converted with
            ``float()``; anything that cannot be converted is "not NaN".

    Returns:
        bool: ``True`` only if ``float(text)`` succeeds and the result is NaN.
    """
    try:
        value = float(text)
    except (TypeError, ValueError):
        # ValueError: a non-numeric string such as "16-16-16".
        # TypeError: a non-convertible object such as None or a list; this
        # used to escape as an exception instead of meaning "not NaN".
        return False
    return math.isnan(value)
# Function for preparing catalog
def prepare_catalog():
    """Build the combined catalog that the search functions match against.

    Loads the internal product catalog and the registered-fertilizer list,
    cleans and compares them through the project's ``preprocess`` helpers,
    merges both sources, drops duplicate rows and adds a 'Formula' column
    extracted from the 'Registered Product' text.

    Returns:
        The combined catalog returned by ``preprocess.combine_catalog`` with
        duplicates removed and a 'Formula' column holding the first formula
        found in 'Registered Product', or NaN when none is found.
    """
    formula_regex = r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}'

    def first_formula(product_name):
        # First formula-looking substring of the name, NaN when there is none.
        found = re.findall(formula_regex, product_name)
        return found[0] if found else np.nan

    # Internal catalog first, then the external (registered fertilizers) one.
    internal = get.internal_data('catalog')
    registered = get.registered_fertilizer_data()

    # Clean the internal catalog; the 'Product SKU Clean' column read below is
    # expected to come out of this helper.
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )

    # Full names = SKU text followed by lower-cased brand and type.
    brand = internal['Brand'].str.lower()
    kind = internal['Type'].str.lower()
    for full_column, sku_column in (
        ('Product SKU Full Clean', 'Product SKU Clean'),
        ('Product SKU Full', 'Product SKU'),
    ):
        internal[full_column] = internal[sku_column] + ' ' + brand + ' ' + kind

    # Duplicate removal, step 0: fuzzy-compare the internal catalog with the
    # registered list (this is where 'Max Similarity Score' is expected from).
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registered,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # Step 1: keep only registered fertilizers that are NOT already in the
    # internal catalog, judged on rows scoring above 80.
    high_similarity = internal['Max Similarity Score'] > 80
    registered = preprocess.slice_with_filter(
        registered,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=high_similarity,
    )

    # Step 2: merge both sources into a single catalog.
    merged = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registered['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    )

    # Step 3: drop exact duplicate rows.
    merged = merged.drop_duplicates()

    # Pull the formula out of the product name (NaN when the name has none).
    merged['Formula'] = merged['Registered Product'].apply(first_formula)
    return merged
# Three 1-2 digit numbers separated by dashes or spaces, e.g. the "16-16-16"
# in "Petro Nitrat 16-16-16".
_FORMULA_PATTERN = r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}'
_DETECT_MODEL = "matthewfarant/indobert-fertilizer-classifier"
_MATCH_MODEL = "matthewfarant/autotrain-fertilizer-pair-classify"

# Loaded pipelines keyed by model name, so each model is built at most once
# per process instead of on every request.
_PIPELINES = {}


def _get_pipeline(model_name):
    """Return the text-classification pipeline for *model_name*, loading it on first use."""
    if model_name not in _PIPELINES:
        _PIPELINES[model_name] = pipeline(
            "text-classification",
            model=model_name,
            token=os.getenv('HF_MY_TOKEN'),
        )
    return _PIPELINES[model_name]


def _with_similarity_scores(catalog, user_input, user_input_formula, product_name_catalog):
    """Return a copy of *catalog* with 'Similarity Score' and 'Formula Similarity' columns added."""
    # Work on a copy so neither the caller's frame nor a filtered slice of it
    # is written to.
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda name: fuzz.token_set_ratio(user_input, name, processor=utils.default_process)
    )
    # NOTE(review): either side may be NaN here (no formula found); this relies
    # on rapidfuzz's handling of missing values - confirm for the pinned version.
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda formula: fuzz.token_set_ratio(user_input_formula, formula, processor=utils.default_process)
    )
    return scored


def decision(user_input, type, catalog, product_name_catalog):
    """Match a free-text product name against the catalog.

    Args:
        user_input: Product text typed by the user.
        type: Search mode - 'Fuzzy Search', 'Training Mode' or
            'Probabilistic Search'.
        catalog: DataFrame with at least the columns 'Source', 'Formula',
            'Registered Product' and the column named by
            ``product_name_catalog``. It is not modified.
        product_name_catalog: Name of the column that the fuzzy name
            comparison runs against.

    Returns:
        * 'Fuzzy Search': one of the "[1]" .. "[6]" decision messages.
        * 'Training Mode': the 'Registered Product' value of the best fuzzy
          match among rows whose 'Source' is 'Product Catalog'.
        * 'Probabilistic Search': a "<score>% probability of being a
          <product>" message for the best pair-classifier score.
        * Any other ``type``: ``None``.

    Raises:
        IndexError: If there are no catalog rows to rank for the chosen mode.
    """
    # First formula-looking substring of the input, NaN when there is none.
    found = re.findall(_FORMULA_PATTERN, user_input)
    user_input_formula = found[0] if found else np.nan

    if type == 'Fuzzy Search':
        scored = _with_similarity_scores(catalog, user_input, user_input_formula, product_name_catalog)
        # Best name match first, then best formula match; ascending 'Source'
        # puts "Product Catalog" ahead of "Registered Fertilizers" on ties.
        top = scored.sort_values(
            by=['Similarity Score', 'Formula Similarity', 'Source'],
            ascending=[False, False, True],
        ).head(1)

        score = top['Similarity Score'].values[0]
        formula_score = top['Formula Similarity'].values[0]
        source = top['Source'].values[0]
        product = top['Registered Product'].values[0]
        formula = top['Formula'].values[0]

        def formula_matches():
            # Exact formula match, or the catalog entry carries no formula.
            return formula_score == 100 or is_nan(formula)

        if score >= 80 and source == 'Product Catalog' and formula_matches():
            return f"[1] Product is Available in Catalog (SKU Registered as *{product}*)"
        if score >= 80 and source == 'Registered Fertilizers' and formula_matches():
            return f"[2] Add as New Product (Registered in Kementan as *{product}*)"
        if score >= 80 and formula_score < 100:
            return f"[3] Add as New Product (Similar to *{product}* in {source} but with different formula)"
        if score < 80:
            # Run the classifier once and reuse the prediction; it used to be
            # re-run for every field that was read from it.
            detected = _get_pipeline(_DETECT_MODEL)(user_input)[0]
            percent = np.round(detected['score'] * 100, 2)
            if detected['label'] == 'Fertilizer' and detected['score'] > 0.8:
                # Rounded to two decimals, the same way message [5] is.
                return f"[4] Add as New Product ({percent}% probability of being a fertilizer)"
            return f"[5] Product might not be a Fertilizer ({percent}% probability of being a {detected['label']})"
        # Only reached when none of the comparisons above hold (for example a
        # NaN score, or a full formula match from an unexpected source).
        return "[6] Product is not a Fertilizer"

    elif type == 'Training Mode':
        # Same fuzzy ranking as above, restricted to the internal catalog.
        internal_rows = catalog[catalog['Source'] == 'Product Catalog']
        scored = _with_similarity_scores(internal_rows, user_input, user_input_formula, product_name_catalog)
        top = scored.sort_values(by=['Similarity Score', 'Formula Similarity'], ascending=False).head(1)
        return top['Registered Product'].values[0]

    elif type == 'Probabilistic Search':
        pipe_match = _get_pipeline(_MATCH_MODEL)
        internal_rows = catalog[catalog['Source'] == 'Product Catalog'].copy()
        # Score every (input, catalog product) pair with the pair classifier.
        internal_rows['Concat Input'] = user_input + '[SEP]' + internal_rows['Registered Product'].astype(str)
        # NOTE(review): this takes the score of the pipeline's top label
        # without checking which label it is - confirm the top label always
        # means "match".
        internal_rows['Similarity Score'] = internal_rows['Concat Input'].apply(
            lambda pair_text: pipe_match(pair_text)[0]['score']
        )
        top = internal_rows.sort_values(by=['Similarity Score'], ascending=False).head(1)
        return f"{np.round(top['Similarity Score'].values[0] * 100,2)}% probability of being a {top['Registered Product'].values[0]}"
def app(input, type):
    """Gradio callback: validate the form values, then run the catalog search.

    Args:
        input: Free-text product name from the textbox.
        type: Selected search mode, forwarded unchanged to ``decision``.

    Returns:
        A prompt asking the user to complete the form when the text is
        missing/blank or no search type is selected; otherwise whatever
        ``decision`` returns for the freshly prepared catalog.
    """
    # Blank or whitespace-only text is treated like a missing value, so it
    # does not trigger a catalog build and model inference on nothing.
    text_missing = input is None or (isinstance(input, str) and not input.strip())
    if text_missing or not type:
        return "Please fill in the input and select the search type"

    # NOTE(review): the catalog is rebuilt on every request; consider caching
    # it if the underlying sources do not change while the app is running.
    catalog = prepare_catalog()
    return decision(input, type, catalog, "Registered Product")
# Initialize the app

# Form controls: free-text product name plus the search mode selector.
_search_inputs = [
    gr.Textbox(),
    gr.Radio(["Fuzzy Search", "Probabilistic Search", "Training Mode"], type="value"),
]

# Example rows offered in the UI, as (product text, search mode) pairs.
_example_rows = [
    ['Petro Nitrat 16-16-16', 'Fuzzy Search'],
    ['Petro Nitrat 15-15-15', 'Fuzzy Search'],
    ['Gramoxone 1 Liter', 'Fuzzy Search'],
    ['Indomie Goreng Aceh', 'Fuzzy Search'],
]

# Markdown passed as the interface's `article` text.
_article_markdown = """
### About The App
This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
the catalog. <br>
### How Does it Work?
This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
model will be able to learn how fertilizer products (especially the local ones) look like. <br>
### What are the Flags For?
The flag is a part of the "Active Transfer Learning" feature of this app when the user selects "Training Mode". When a user flags an output as "Correct" or "Incorrect",
the developer will be able to fine-tune the model using the user's input, hence improving the model's performance when the user selects "Probabilistic Search". So, please
help us to improve the model by flagging the prediction result 🙏 <br>
### I want to test multiple inputs at once!
You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
programmatically. <br>
"""

# `app` handles each submission; manual flagging stores "Correct"/"Incorrect"
# feedback under 'flagging/'.
demo = gr.Interface(
    fn=app,
    inputs=_search_inputs,
    outputs="text",
    examples=_example_rows,
    title='Fertilizer Catalog Engine 🌽',
    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
    article=_article_markdown,
    api_name='search',
    allow_flagging='manual',
    flagging_options=["Correct", "Incorrect"],
    flagging_dir='flagging/',
    theme=gr.themes.Soft(),
)
# Run the app
# Launch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch(show_api=True)