Spaces:
Runtime error
Runtime error
Commit ·
b279c69
1
Parent(s): 484b915
Initial commit
Browse files- functions/extract_function.py +101 -0
- functions/modelling_function.py +191 -0
- functions/preprocessing_function.py +203 -0
functions/extract_function.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import yaml
|
| 3 |
+
import requests
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
def internal_data(type):
    """
    Extract internal data from either the catalog or the query folder.

    Every ``.xlsx`` workbook inside the selected folder (``catalog/`` or
    ``query/``, relative to the current working directory) is read with
    ``pandas.read_excel`` and the results are concatenated with a fresh index.

    :param type: str, 'catalog' or 'query' (the name shadows the builtin; it is
        kept so existing keyword callers keep working)

    :return: pandas.DataFrame, the concatenated workbooks (an empty dataframe
        when the folder holds no ``.xlsx`` file), or the string
        'Error: type must be either catalog or query' for any other value
    """
    if type not in ('catalog', 'query'):
        return 'Error: type must be either catalog or query'

    # Sort the listing so the row order does not depend on the file system.
    workbooks = sorted(name for name in os.listdir(type) if name.endswith('.xlsx'))
    if not workbooks:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame()

    frames = [pd.read_excel(os.path.join(type, name)) for name in workbooks]
    return pd.concat(frames, ignore_index=True)
|
| 34 |
+
|
| 35 |
+
def _scrape_registered_pages(base_url, column_selector, column_names, timeout=60):
    """Scrape consecutive pages of one listing until a page no longer exposes the data table."""
    pages = []
    page_number = 1
    while True:
        html = requests.get(base_url + str(page_number), timeout=timeout).content
        try:
            # The listing is the sixth table of the page; an IndexError here
            # (table or columns missing) is what ends the pagination.
            table = pd.read_html(html)[5]
            page = table.iloc[2:-1, column_selector].rename(columns=column_names)
        except IndexError:
            break
        page['Page Number'] = page_number
        pages.append(page)
        page_number += 1

    return pd.concat(pages, ignore_index=True).dropna()


def registered_fertilizer_data():
    """
    Return the registered fertilizer data, scraping it when no cached copy exists.

    When the ``external`` folder contains any entry, ``external/registered_fertilizers.csv``
    is read and returned. Otherwise the organic and inorganic listings are scraped
    page by page from the URLs stored under ``scraping_url`` in ``config.yaml``.
    The scraped result is returned but not written to disk by this function.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data; a
        freshly scraped frame has the columns 'Merek', 'Jenis', 'Nomor Pendaftaran',
        'Page Number' and 'Nama Lengkap' ('Jenis' and 'Merek' joined by a space)
    """
    # A populated "external" folder is treated as a cache of a previous extraction.
    if os.listdir('external'):
        return pd.read_csv('external/registered_fertilizers.csv')

    print('External folder is empty. Extracting data from Ministry of Agriculture website...')

    # Read the configuration once and close the file (it used to be re-opened per page).
    with open('config.yaml') as config_file:
        scraping_url = yaml.load(config_file, Loader=yaml.FullLoader)['scraping_url']

    print('Extracting Organic Fertilizer Data...')
    registered_organic_fertilizers = _scrape_registered_pages(
        scraping_url['organik'][0],
        [2, 3, 6],
        {2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'},
    )

    print('Extracting Inorganic Fertilizer Data...')
    registered_inorganic_fertilizers = _scrape_registered_pages(
        scraping_url['anorganik'][0],
        slice(5, 8),
        {5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'},
    )

    registered_fertilizers = pd.concat(
        [registered_organic_fertilizers, registered_inorganic_fertilizers],
        ignore_index=True,
    )
    registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
    return registered_fertilizers
|
| 86 |
+
|
| 87 |
+
def scrape_result():
    """
    Extract scraped result data.

    Reads every ``.csv`` file inside the ``scrape_result`` folder (relative to the
    current working directory) and stacks them into one dataframe. Other entries
    in the folder are ignored, mirroring the extension filter used for the
    ``.xlsx`` inputs in ``internal_data``.

    :return: pandas.DataFrame, dataframe containing scraped result data (empty
        when the folder holds no ``.csv`` file)
    """
    directory = 'scrape_result'

    # Sorted so the row order is reproducible across file systems.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_names:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)
|
functions/modelling_function.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import yaml
|
| 6 |
+
import os
|
| 7 |
+
import warnings
|
| 8 |
+
from rapidfuzz import fuzz, utils
|
| 9 |
+
from simpletransformers.classification import ClassificationModel, ClassificationArgs
|
| 10 |
+
from sklearn.model_selection import train_test_split
|
| 11 |
+
from sklearn.metrics import confusion_matrix, classification_report
|
| 12 |
+
from scipy.special import softmax
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _label_category(categories):
    """Map one product's category list to its coarse training label."""
    if not isinstance(categories, list) or not categories:
        return 'Others'

    # A single category is a confident label, several categories only a "Medium" one.
    confidence = 'High' if len(categories) == 1 else 'Medium'
    if 'Garden Soil & Fertilizers' in categories:
        return 'Fertilizer - ' + confidence
    if 'Weeds & Pest Control' in categories:
        return 'Pesticide - ' + confidence
    return 'Others'


def generate_training_data(df, text_column, label_column, external_table = None, external_column = None, add_external_table=False, sampling=True):
    """
    This function generates training data for the model.

    When the ``training`` folder contains any entry, ``training/training_data.csv``
    is returned as-is and every argument is ignored. Otherwise the rows of ``df``
    are labelled from their category lists, the "Medium" labels are dropped, and
    "Others" rows whose text matches one of the ``excluded_words`` patterns from
    ``config.yaml`` are removed.

    Side effects: a 'category_name' column is added to the caller's ``df`` and,
    when ``add_external_table`` is true, to the caller's ``external_table``.

    :param df: pandas.DataFrame, dataframe containing product name and category list
    :param text_column: str, column name containing product name
    :param label_column: str, column name containing the list of category names
    :param external_table: pandas.DataFrame, extra products that are all labelled 'Fertilizer - High'
    :param external_column: str, column name containing product name in ``external_table``
    :param add_external_table: bool, whether to add external table or not
    :param sampling: bool, whether to do sampling or not (only used together with ``add_external_table``)

    :return: pandas.DataFrame. With ``add_external_table`` the columns are
        'product_name' and 'category_name', the latter encoded as 0 (fertilizer),
        1 (pesticide) or 2 (others); without it the frame keeps ``text_column``
        and the textual 'category_name'.
    :raises ValueError: from ``DataFrame.sample`` when ``sampling`` is true and a
        class has fewer rows than its fixed sample size (1250 / 1250 / 1500)
    """
    # A populated "training" folder means the data set was generated before; reuse it.
    if os.listdir('training'):
        return pd.read_csv('training/training_data.csv')

    print('Training folder is empty. Generating training data...')
    with open('config.yaml') as config_file:
        units = yaml.load(config_file, Loader=yaml.FullLoader)['excluded_words']

    df['category_name'] = df[label_column].apply(_label_category)
    df = df[[text_column, 'category_name']]

    # take only where category_name is Fertilizer - High or Pesticide - High or Others
    df = df[df['category_name'].isin(['Fertilizer - High', 'Pesticide - High', 'Others'])]

    # exclude product name that contains units AND category_name is Others.
    # An empty word list would build the pattern '' which matches every row,
    # so the filter is skipped in that case.
    if units:
        mentions_unit = df[text_column].str.contains('|'.join(units))
        df = df[~(mentions_unit & (df['category_name'] == 'Others'))]

    if not add_external_table:
        return df

    external_table['category_name'] = 'Fertilizer - High'
    external_table = external_table[[external_column, 'category_name']]
    external_table.columns = [text_column, 'category_name']

    training_df = pd.concat([external_table, df])
    training_df.columns = ['product_name', 'category_name']

    label_ids = {'Fertilizer - High': 0, 'Pesticide - High': 1}
    training_df['category_name'] = training_df['category_name'].apply(lambda label: label_ids.get(label, 2))

    if not sampling:
        return training_df

    sample_sizes = {0: 1250, 1: 1250, 2: 1500}
    return pd.concat([
        training_df[training_df['category_name'] == label_id].sample(n=size)
        for label_id, size in sample_sizes.items()
    ])
|
| 59 |
+
|
| 60 |
+
def category_reassign(row, reference_df, checked_category, threshold=70):
    """
    This function reassigns the category name of a product based on the similarity score between the product name and the reference dataframe.

    Only rows currently labelled ``checked_category`` are examined. The reference
    rows carrying a different category are scanned in order, and the category of
    the first one whose name reaches ``threshold`` (``fuzz.ratio`` after
    ``utils.default_process``) is returned.

    :param row: pandas.Series, row of dataframe with 'product_name' and 'category_name'
    :param reference_df: pandas.DataFrame, dataframe containing 'product_name' and 'category_name'
    :param checked_category: str, category name to be checked
    :param threshold: int, threshold for similarity score

    :return: str, category name (unchanged when no sufficiently similar reference row exists)
    """
    current_category = row['category_name']
    if current_category != checked_category:
        return current_category

    product_name = row['product_name']

    # Walk the two columns directly instead of materialising one Series per row.
    candidates = zip(reference_df['product_name'], reference_df['category_name'])
    for other_name, other_category in candidates:
        if other_category == checked_category:
            continue
        if fuzz.ratio(product_name, other_name, processor=utils.default_process) >= threshold:
            return other_category

    return checked_category
|
| 80 |
+
|
| 81 |
+
def train_model(df, stratify=True, model_type='bert', use_existing_model=False, model_name=None):
    """
    This function trains the model using the configuration in config.yaml

    The ``parameters`` section of ``config.yaml`` supplies the test split size
    (``training_args.test_size``), the model arguments (``model_args``), the
    pretrained checkpoint per model type (``model_types``) and the class names
    (``class_names``). The model is always created with ``use_cuda=False``.

    Side effect: all Python warnings are silenced process-wide.

    :param df: pandas.DataFrame, dataframe containing product name and category name
    :param stratify: bool, whether to stratify the train/test split on 'category_name'
    :param model_type: str, type of model to use
    :param use_existing_model: bool, whether to use existing model or not
    :param model_name: str, name of existing model (used only when ``use_existing_model`` is true)

    :return: tuple of (model, predictions, classification report, training
        dataframe, testing dataframe, list of class names)
    """
    warnings.filterwarnings('ignore')

    # Load the configuration once and close the file afterwards.
    with open('config.yaml') as config_file:
        parameters = yaml.load(config_file, Loader=yaml.FullLoader)['parameters']

    test_size = parameters['training_args']['test_size']
    # Honour the ``stratify`` flag; previously the split was always stratified.
    stratify_labels = df['category_name'] if stratify else None
    train_df, test_df = train_test_split(df, test_size=test_size, stratify=stratify_labels)

    # Optional model configuration
    model_config = parameters['model_args']
    model_args = ClassificationArgs()
    for option in (
        'num_train_epochs',
        'train_batch_size',
        'eval_batch_size',
        'overwrite_output_dir',
        'fp16',
        'do_lower_case',
    ):
        setattr(model_args, option, model_config[option])

    # Create a ClassificationModel
    class_names = parameters['class_names']
    if use_existing_model:
        checkpoint = model_name
    else:
        checkpoint = parameters['model_types'][model_type]
    model = ClassificationModel(model_type, checkpoint, num_labels=len(class_names), args=model_args, use_cuda=False)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    preds = np.argmax(model_outputs, axis=1)
    class_report = classification_report(test_df['category_name'], preds, target_names=class_names)

    return model, preds, class_report, train_df, test_df, class_names
|
| 131 |
+
|
| 132 |
+
def save_model(model, model_name):
    """
    Persist a trained model to the directory ``model_name``.

    The wrapped network and its tokenizer are saved with ``save_pretrained`` into
    ``model_name``; the configuration is saved into ``model_name + '/'``. A
    confirmation line is printed at the end.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param model_name: str, name of model (used as the output directory)

    :return: None
    """
    model.model.save_pretrained(model_name)
    model.tokenizer.save_pretrained(model_name)

    config_dir = model_name + '/'
    model.config.save_pretrained(config_dir)

    print('Model saved to ' + config_dir)
|
| 145 |
+
|
| 146 |
+
def show_confusion_matrix(test_category, preds, class_names):
    """
    This function shows the confusion matrix.

    The matrix is drawn as an annotated seaborn heatmap on the current matplotlib
    figure, with true labels on the y axis and predictions on the x axis.

    :param test_category: numpy.ndarray, array of true category labels
    :param preds: numpy.ndarray, array of predictions
    :param class_names: list, list of class names; its length must match the
        number of distinct labels found in ``test_category`` and ``preds``

    :return: matplotlib axes holding the heatmap (the function used to return
        None although an axes object was documented)
    """
    matrix = confusion_matrix(test_category, preds)
    matrix_df = pd.DataFrame(matrix, index=class_names, columns=class_names)

    hmap = sns.heatmap(matrix_df, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True Topics')
    plt.xlabel('Predicted Topics')

    return hmap
|
| 163 |
+
|
| 164 |
+
def predict_proba(model,text):
    """
    This function predicts the probability of each class and prints it (in a text form).

    The raw outputs returned by ``model.predict`` for the single text are turned
    into probabilities with a softmax over the class axis. Positions 0, 1 and 2
    are printed as 'Fertilizer', 'Pesticide' and 'Others'.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities (the function used to return
        None although this array was documented)
    """
    _, raw_outputs = model.predict([text])
    # Normalise across classes explicitly rather than over the whole array.
    proba = softmax(raw_outputs, axis=-1)[0]

    print('-----------------------------')
    print('Text to Predict: ', text)
    print('Probability of each class:')
    print('Fertilizer: ', proba[0])
    print('Pesticide: ', proba[1])
    print('Others: ', proba[2])

    return proba
|
| 180 |
+
|
| 181 |
+
def predict_proba_array(model,text):
    """
    This function predicts the probability of each class (in an array form).

    The raw outputs returned by ``model.predict`` for the single text are turned
    into probabilities with a softmax over the class axis.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities, one entry per class
    """
    _, raw_outputs = model.predict([text])
    # Explicit class axis: the default (axis=None) normalises over every element,
    # which is only equivalent because exactly one text is passed.
    return softmax(raw_outputs, axis=-1)[0]
|
functions/preprocessing_function.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from rapidfuzz import process, fuzz, utils
|
| 4 |
+
|
| 5 |
+
def clean_dataframe(df, column, remove_na=True, remove_non_words=True, remove_symbols=True, remove_duplicates=True):
    """
    This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates.

    A new column named ``column + ' Clean'`` is added to the caller's dataframe.
    It holds the lowercased text of ``column`` after the optional steps below.

    Parameters:
    df (pandas.DataFrame): The dataframe to clean.
    column (str): The column to clean.
    remove_na (bool): Whether to drop rows whose original value is missing or whose cleaned text is blank.
    remove_non_words (bool): Whether to drop every whitespace-separated word that contains a digit.
    remove_symbols (bool): Whether to drop every character that is neither alphanumeric nor whitespace.
    remove_duplicates (bool): Whether to keep only the first row per cleaned text.

    Returns:
    pandas.DataFrame: The cleaned dataframe.
    """
    clean_column = column + ' Clean'

    # Remember the missing cells now: str() below turns them into 'nan' / 'none'.
    was_missing = df[column].isna()

    def _clean(value):
        text = str(value).lower()
        if remove_non_words:
            text = ' '.join(word for word in text.split() if not any(char.isdigit() for char in word))
        if remove_symbols:
            text = ''.join(letter for letter in text if letter.isalnum() or letter.isspace())
        return text

    # Element-wise apply keeps the result aligned with any index, unlike the
    # previous mix of positional iloc[i] reads and label-based df.at[i] writes.
    df[clean_column] = df[column].apply(_clean)

    # Drop rows that were missing or are blank once surrounding whitespace is removed.
    if remove_na:
        is_blank = df[clean_column].astype(str).str.strip() == ''
        df = df[~(was_missing | is_blank)]

    # Remove duplicates
    if remove_duplicates:
        df = df.drop_duplicates(subset=[clean_column])

    return df
|
| 48 |
+
|
| 49 |
+
def fuzzy_join(row, df_reference, column_reference, column_matched_to, take_regist_number=False, take_source = False, set_ratio_weight=0.5, ratio_weight=0.5):
    """
    This function applies fuzzy join to the given row and returns the best matching reference name
    based on the maximum similarity score between the two columns.

    Scoring rule (kept as-is because downstream thresholds depend on it):
    ``set_ratio_weight == 0`` uses the plain ``fuzz.ratio`` unweighted,
    ``ratio_weight == 0`` uses the plain ``fuzz.token_set_ratio`` unweighted, and
    otherwise the two scores are combined with the given weights. On ties the
    first reference row wins.

    Parameters:
    row (pandas.Series): The row to apply fuzzy join on.
    df_reference (pandas.DataFrame): The dataframe to compare with.
    column_reference (str): The column of df_reference holding the candidate names.
    column_matched_to (str): The column of row holding the text to match.
    take_regist_number (bool): Whether to also return 'Nomor Pendaftaran' of the best match.
    take_source (bool): Whether to also return 'Source' of the best match.
    set_ratio_weight (float): The weight of the token-set similarity.
    ratio_weight (float): The weight of the plain ratio similarity.

    Returns:
    tuple: (matched name, score), followed by the registration number and/or the
    source when requested, in that order. Without any candidate scoring above 0
    the name, number and source are '' and the score is 0.
    """
    query = row[column_matched_to]

    similar_product_name = ''
    similarity_score = 0
    best_position = None

    for position, product_name in enumerate(df_reference[column_reference]):
        if set_ratio_weight == 0:
            score = fuzz.ratio(product_name.lower(), query.lower(), processor=utils.default_process)
        elif ratio_weight == 0:
            score = fuzz.token_set_ratio(product_name, query, processor=utils.default_process)
        else:
            token_set_score = fuzz.token_set_ratio(product_name, query, processor=utils.default_process)
            plain_score = fuzz.ratio(product_name.lower(), query.lower(), processor=utils.default_process)
            score = set_ratio_weight * token_set_score + ratio_weight * plain_score

        if score > similarity_score:
            similarity_score = score
            similar_product_name = product_name
            best_position = position

    # Read the extra fields once, from the winning row, instead of filtering the
    # whole reference frame every time the best score improves.
    nomor_pendaftaran = ''
    source = ''
    if best_position is not None:
        if take_regist_number:
            nomor_pendaftaran = df_reference['Nomor Pendaftaran'].iloc[best_position]
        if take_source:
            source = df_reference['Source'].iloc[best_position]

    if take_regist_number and take_source:
        return similar_product_name, similarity_score, nomor_pendaftaran, source
    if take_regist_number:
        return similar_product_name, similarity_score, nomor_pendaftaran
    if take_source:
        return similar_product_name, similarity_score, source
    return similar_product_name, similarity_score
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def fuzzy_join_compare(df, first_column, second_column, registered_fertilizers, take_regist_number=True, set_ratio_weight=1, ratio_weight=0):
    """
    This function applies fuzzy join to the given dataframe twice, once per text column, and keeps
    for every row the match with the higher similarity score (the first column wins ties).

    The columns 'Max Similarity Score' and 'Matched Product Name' are added to the
    caller's dataframe, plus 'Nomor Pendaftaran' when take_regist_number is true.

    Parameters:
    df (pandas.DataFrame): The dataframe to apply fuzzy join on.
    first_column (str): The first column to use for fuzzy join.
    second_column (str): The second column to compare with.
    registered_fertilizers (pandas.DataFrame): The dataframe containing the registered fertilizers; matched on 'Nama Lengkap'.
    take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
    set_ratio_weight (float): The weight of the token-set similarity.
    ratio_weight (float): The weight of the plain ratio similarity.

    Returns:
    pandas.DataFrame: The input dataframe with the additional columns described above.
    """
    # fuzzy_join returns 2-tuples without the registration number; unpacking into
    # three columns unconditionally used to fail for take_regist_number=False.
    match_fields = ['name', 'score', 'number'] if take_regist_number else ['name', 'score']

    def _match_against(column):
        matches = [
            fuzzy_join(
                row,
                registered_fertilizers,
                'Nama Lengkap',
                column,
                take_regist_number=take_regist_number,
                set_ratio_weight=set_ratio_weight,
                ratio_weight=ratio_weight,
            )
            for _, row in df.iterrows()
        ]
        # Explicit columns keep this working for an empty dataframe as well.
        return pd.DataFrame(matches, columns=match_fields, index=df.index)

    first = _match_against(first_column)
    second = _match_against(second_column)

    # If similarity score 1 is higher than or equal to similarity score 2, take match 1, else match 2
    first_wins = (first['score'] >= second['score']).to_numpy()

    df['Max Similarity Score'] = np.where(first_wins, first['score'].to_numpy(), second['score'].to_numpy())
    df['Matched Product Name'] = np.where(first_wins, first['name'].to_numpy(), second['name'].to_numpy())
    if take_regist_number:
        df['Nomor Pendaftaran'] = np.where(first_wins, first['number'].to_numpy(), second['number'].to_numpy())

    return df
|
| 126 |
+
|
| 127 |
+
def slice_with_filter(df, column, ref_df, use_filter=False, filter_condition=None):
    """
    Return the rows of ``df`` whose ``column`` value does not occur in the reference dataframe.

    :param df: pandas.DataFrame, dataframe to be sliced
    :param column: str, column compared between both dataframes
    :param ref_df: pandas.DataFrame, reference dataframe
    :param use_filter: bool, whether to restrict the reference rows first
    :param filter_condition: indexer applied as ``ref_df[filter_condition]`` when
        ``use_filter`` is true (for example a boolean mask)

    :return: pandas.DataFrame, sliced dataframe
    """
    reference = ref_df[filter_condition] if use_filter else ref_df
    excluded_values = reference[column].to_list()
    already_present = df[column].isin(excluded_values)
    return df[~already_present]
|
| 143 |
+
|
| 144 |
+
def combine_catalog(column_1, column_2, source_1, source_2):
    """
    Stack two product columns into one dataframe that records where each row came from.

    :param column_1: pandas.Series, first column
    :param column_2: pandas.Series, second column
    :param source_1: str, source of first column
    :param source_2: str, source of second column

    :return: pandas.DataFrame with the columns 'Registered Product' and 'Source'
        and a fresh 0..n-1 index
    """
    products = pd.concat([column_1, column_2]).to_frame(name='Registered Product')

    # One source label per product, built in the same order as the products.
    first_labels = column_1.apply(lambda _: source_1)
    second_labels = column_2.apply(lambda _: source_2)
    products['Source'] = pd.concat([first_labels, second_labels])

    return products.reset_index(drop=True)
|
| 161 |
+
|
| 162 |
+
def clean_category_dataframe(df, category_column, product_name_column, reference_table, reference_column, split=False):
    """
    This function parses the raw category text of each product and attaches the resulting
    category list to the reference table, matching on the lowercased, stripped product name.

    Side effects: the caller's df gets its category column rewritten in place and
    gains the columns 'category_list' and 'product_name_clean'.

    Parameters:
    df (pandas.DataFrame): The dataframe holding the raw category text.
    category_column (str): The column containing category name.
    product_name_column (str): The column containing product name.
    reference_table (pandas.DataFrame): The table the categories are joined onto; it must contain 'Product Name' and 'Product Name Clean'.
    reference_column (str): The column of reference_table used as the join key.
    split (bool): Whether to also return the subset of rows that have a category list.

    Returns:
    pandas.DataFrame: 'Product Name', 'Product Name Clean' and 'category_list' (NaN when the category is unknown or the product was not found).
    With split=True a tuple (full dataframe, dataframe without the NaN rows) is returned instead.
    """
    def _normalise_category_text(value):
        # Anything without the "Category" marker (including NaN) counts as unknown.
        if not (isinstance(value, str) and 'Category' in value):
            return 'Unknown'
        # Drop the marker, turn line breaks into separators and remove the
        # "Lihat Lebih Banyak" text.
        return value.replace('Category', '').replace('\n', ',').replace('Lihat Lebih Banyak', '')

    def _finalise_category_list(categories):
        if not isinstance(categories, list):
            return categories
        if 'Unknown' in categories:
            return np.nan
        # Strip before discarding empties so whitespace-only entries do not
        # survive as '' (they used to, because the strip ran last).
        stripped = [category.strip() for category in categories]
        return [category for category in stripped if category != '']

    df[category_column] = df[category_column].apply(_normalise_category_text)
    df['category_list'] = df[category_column].apply(lambda text: text.split(','))
    df['product_name_clean'] = df[product_name_column].apply(lambda name: str(name).lower().strip())

    # Remove duplicates, keeping the last occurrence of each product
    df = df.drop_duplicates(subset=['product_name_clean'], keep='last')

    # Left join onto the reference table through an explicit temporary key column
    merge_key = '__reference_key__'
    keyed_reference = reference_table.assign(
        **{merge_key: reference_table[reference_column].str.lower().str.strip()}
    )
    df_reference = keyed_reference.merge(
        df[['product_name_clean', 'category_list']],
        how='left',
        left_on=merge_key,
        right_on='product_name_clean',
    )

    # Choose final columns; copy so the assignment below does not act on a view
    df_reference = df_reference[['Product Name', 'Product Name Clean', 'category_list']].copy()
    df_reference['category_list'] = df_reference['category_list'].apply(_finalise_category_list)

    if split:
        return df_reference, df_reference.dropna(subset=['category_list'])
    return df_reference
|