PixelPoppie
committed on
Commit
·
9fbbb5c
1
Parent(s):
62ee460
updated model, updated app, only used tf-idf
Browse files
- MfrID_encoder.pkl +2 -2
- SPSC_encoder.pkl +2 -2
- app.py +54 -220
- description_engineering_models.pkl +0 -3
- requirements.txt +2 -4
- setup.sh +1 -3
- w2v_model.model.syn1neg.npy → svd_model.joblib +2 -2
- svd_model.pkl +0 -3
- tfidf_vectorizer.pkl → tfidf_vectorizer.joblib +2 -2
- w2v_model.model +0 -3
- w2v_model.model.wv.vectors.npy +0 -3
MfrID_encoder.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a687c15c69eeafc5c1145fd198ed9cb06d71c5732b06239c942744df1b15e79
|
| 3 |
+
size 8730949
|
SPSC_encoder.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23b27e44a468816577f36b6ee4ee835fb12bdb57bcab2792fd0bae737c1a5f3a
|
| 3 |
+
size 5
|
app.py
CHANGED
|
@@ -10,333 +10,170 @@ import nltk
|
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
import logging
|
| 13 |
-
import
|
| 14 |
-
import os
|
| 15 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 16 |
from sklearn.decomposition import TruncatedSVD
|
| 17 |
from nltk.tokenize import word_tokenize
|
| 18 |
from huggingface_hub import hf_hub_download
|
| 19 |
|
| 20 |
# Download the 'punkt' tokenizer
|
| 21 |
-
nltk.download('punkt')
|
| 22 |
|
| 23 |
# Set up logging
|
| 24 |
logging.basicConfig(level=logging.DEBUG)
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
| 27 |
-
# Download NLTK data
|
| 28 |
-
nltk.download('punkt')
|
| 29 |
-
|
| 30 |
# Get Hugging Face access token from environment variable
|
| 31 |
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
|
| 32 |
|
| 33 |
# Set the repository IDs
|
| 34 |
-
model_repo_id = 'apoppie/
|
| 35 |
|
| 36 |
# Load the trained Random Forest model from Hugging Face Hub
|
| 37 |
model_file = hf_hub_download(
|
| 38 |
repo_id=model_repo_id,
|
| 39 |
-
filename='
|
| 40 |
use_auth_token=access_token,
|
| 41 |
resume_download=True
|
| 42 |
)
|
| 43 |
-
|
| 44 |
-
rf_model = pickle.load(f)
|
| 45 |
|
| 46 |
-
#
|
| 47 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 48 |
|
| 49 |
-
|
| 50 |
-
mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
|
| 51 |
with open(mfrid_encoder_path, 'rb') as f:
|
| 52 |
mfrid_encoder = pickle.load(f)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
|
| 56 |
with open(spsc_encoder_path, 'rb') as f:
|
| 57 |
spsc_encoder = pickle.load(f)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
svd_model = pickle.load(f)
|
| 62 |
-
|
| 63 |
-
# Load TF-IDF vectorizer
|
| 64 |
-
with open(os.path.join(current_dir, 'tfidf_vectorizer.pkl'), 'rb') as f:
|
| 65 |
-
tfidf_vectorizer = pickle.load(f)
|
| 66 |
-
|
| 67 |
-
# Load Word2Vec model
|
| 68 |
-
w2v_model = gensim.models.Word2Vec.load(os.path.join(current_dir, 'w2v_model.model'))
|
| 69 |
-
|
| 70 |
-
def binarize_spsc(spsc_codes, spsc_encoder):
|
| 71 |
-
"""Convert list of SPSC codes to binary values using the provided encoder"""
|
| 72 |
-
out = []
|
| 73 |
-
format_string = spsc_encoder['format_string']
|
| 74 |
-
bits = spsc_encoder['bits']
|
| 75 |
-
for v in spsc_codes:
|
| 76 |
-
try:
|
| 77 |
-
v = int(v) if not pd.isna(v) else 0 # Handle NaN
|
| 78 |
-
enc = format_string.format(v)
|
| 79 |
-
out.append([int(c) for c in enc])
|
| 80 |
-
except Exception:
|
| 81 |
-
out.append([0] * bits) # Use zero vector for invalid entries
|
| 82 |
-
return out
|
| 83 |
-
|
| 84 |
-
def encode_mfrid(mfrid_list, mfrid_encoder):
|
| 85 |
-
"""Encode MfrID using the provided mapping and convert to binary"""
|
| 86 |
-
mfrid_to_int = mfrid_encoder # Assuming mfrid_encoder is a dict
|
| 87 |
-
bits = max(mfrid_to_int.values()).bit_length()
|
| 88 |
-
mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
|
| 89 |
-
encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
|
| 90 |
-
binary_encoded = [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
|
| 91 |
-
return binary_encoded
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
|
| 96 |
|
| 97 |
def preprocess_json_content(content):
|
| 98 |
-
logger.debug("Original content: %s", content[:100]) # Log first 100 characters
|
| 99 |
-
|
| 100 |
-
# Remove any leading/trailing whitespace
|
| 101 |
content = content.strip()
|
| 102 |
-
logger.debug("After stripping: %s", content[:100])
|
| 103 |
-
|
| 104 |
-
# Handle case where content is wrapped in extra quotes
|
| 105 |
if content.startswith('"') and content.endswith('"'):
|
| 106 |
content = content[1:-1]
|
| 107 |
-
logger.debug("After removing extra quotes: %s", content[:100])
|
| 108 |
-
|
| 109 |
-
# Ensure content is wrapped in square brackets
|
| 110 |
if not content.startswith('['):
|
| 111 |
content = '[' + content
|
| 112 |
if not content.endswith(']'):
|
| 113 |
content = content + ']'
|
| 114 |
-
logger.debug("After adding brackets: %s", content[:100])
|
| 115 |
-
|
| 116 |
-
# Replace any trailing commas before closing brackets
|
| 117 |
content = re.sub(r',\s*}', '}', content)
|
| 118 |
content = re.sub(r',\s*]', ']', content)
|
| 119 |
-
logger.debug("After removing trailing commas: %s", content[:100])
|
| 120 |
-
|
| 121 |
return content
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
def parse_json_line_by_line(content):
|
| 124 |
data = []
|
| 125 |
-
for
|
| 126 |
line = line.strip()
|
| 127 |
if line:
|
| 128 |
try:
|
| 129 |
-
# Handle lines that start with comma
|
| 130 |
if line.startswith(','):
|
| 131 |
line = line[1:]
|
| 132 |
-
# Handle lines that are wrapped in square brackets
|
| 133 |
if line.startswith('[') and line.endswith(']'):
|
| 134 |
line = line[1:-1]
|
| 135 |
obj = json.loads(line)
|
| 136 |
data.append(obj)
|
| 137 |
except json.JSONDecodeError:
|
| 138 |
-
logger.warning(f"Failed to parse line
|
| 139 |
return data
|
| 140 |
|
| 141 |
def extract_features(df):
|
| 142 |
-
"""Extract required features from the DataFrame"""
|
| 143 |
required_features = ['Description', 'SPSC', 'MfrID']
|
| 144 |
extracted_features = {}
|
| 145 |
missing_features = []
|
| 146 |
|
| 147 |
for feature in required_features:
|
| 148 |
-
# Try to find a column that contains the feature name (case-insensitive)
|
| 149 |
matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
|
| 150 |
if matching_columns:
|
| 151 |
extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
|
| 152 |
else:
|
| 153 |
missing_features.append(feature)
|
| 154 |
-
extracted_features[feature] = [''] * len(df)
|
| 155 |
|
| 156 |
return extracted_features, missing_features
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
def predict_from_json(json_file, confidence_threshold=0.7):
|
| 168 |
try:
|
| 169 |
-
|
| 170 |
-
content = read_json_file(json_file)
|
| 171 |
data = parse_json_content(content)
|
| 172 |
|
| 173 |
if not data:
|
| 174 |
-
logger.error("No valid JSON objects found in the input")
|
| 175 |
return "No valid JSON objects found in the input. Please check your JSON format."
|
| 176 |
|
| 177 |
-
# Convert to DataFrame
|
| 178 |
df = pd.DataFrame(data)
|
| 179 |
logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
|
| 180 |
|
| 181 |
-
# Extract features
|
| 182 |
extracted_features, missing_features = extract_features(df)
|
| 183 |
|
| 184 |
-
# Warn about missing features
|
| 185 |
if missing_features:
|
| 186 |
logger.warning(f"Missing features: {', '.join(missing_features)}")
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
except Exception as e:
|
| 192 |
-
logger.error(f"Error in preprocessing descriptions: {str(e)}")
|
| 193 |
-
description_features = np.zeros((len(extracted_features['Description']), svd_model.n_components + w2v_model.vector_size))
|
| 194 |
|
| 195 |
-
spsc_array = preprocess_spsc(extracted_features['SPSC'])
|
| 196 |
-
mfrid_array = preprocess_mfrid(extracted_features['MfrID'])
|
| 197 |
-
|
| 198 |
-
# Combine all features
|
| 199 |
X = np.hstack([description_features, spsc_array, mfrid_array])
|
| 200 |
|
| 201 |
-
# Make predictions with probabilities
|
| 202 |
predictions = rf_model.predict(X)
|
| 203 |
probabilities = rf_model.predict_proba(X)
|
| 204 |
max_probabilities = np.max(probabilities, axis=1)
|
| 205 |
|
| 206 |
-
# Add predictions and confidence to the DataFrame
|
| 207 |
df['Prediction'] = predictions
|
| 208 |
df['Confidence'] = max_probabilities
|
| 209 |
df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
|
| 210 |
|
| 211 |
-
# Select columns for output, including only available columns
|
| 212 |
output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
|
| 213 |
available_columns = [col for col in output_columns if col in df.columns]
|
| 214 |
output_df = df[available_columns]
|
| 215 |
|
| 216 |
-
|
| 217 |
-
csv_result = output_df.to_csv(index=False)
|
| 218 |
-
return csv_result
|
| 219 |
|
| 220 |
except Exception as e:
|
| 221 |
logger.error("An error occurred", exc_info=True)
|
| 222 |
error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
|
| 223 |
return f"An error occurred:\n{error_message}"
|
| 224 |
|
| 225 |
-
def read_json_file(json_file):
|
| 226 |
-
if isinstance(json_file, str):
|
| 227 |
-
with open(json_file, 'r') as file:
|
| 228 |
-
return file.read()
|
| 229 |
-
elif hasattr(json_file, 'read'):
|
| 230 |
-
content = json_file.read()
|
| 231 |
-
return content.decode('utf-8') if isinstance(content, bytes) else content
|
| 232 |
-
else:
|
| 233 |
-
raise ValueError("Invalid input type. Expected file path or file-like object.")
|
| 234 |
-
|
| 235 |
-
def parse_json_content(content):
|
| 236 |
-
content = preprocess_json_content(content)
|
| 237 |
-
try:
|
| 238 |
-
data = json.loads(content)
|
| 239 |
-
return [data] if not isinstance(data, list) else data
|
| 240 |
-
except json.JSONDecodeError:
|
| 241 |
-
return parse_json_line_by_line(content)
|
| 242 |
-
|
| 243 |
-
def preprocess_json_content(content):
|
| 244 |
-
content = content.strip()
|
| 245 |
-
if content.startswith('"') and content.endswith('"'):
|
| 246 |
-
content = content[1:-1]
|
| 247 |
-
if not content.startswith('['):
|
| 248 |
-
content = '[' + content
|
| 249 |
-
if not content.endswith(']'):
|
| 250 |
-
content = content + ']'
|
| 251 |
-
content = content.replace(',]', ']').replace(',}', '}')
|
| 252 |
-
return content
|
| 253 |
-
|
| 254 |
-
def parse_json_line_by_line(content):
|
| 255 |
-
data = []
|
| 256 |
-
for line in content.split('\n'):
|
| 257 |
-
line = line.strip()
|
| 258 |
-
if line:
|
| 259 |
-
try:
|
| 260 |
-
if line.startswith(','):
|
| 261 |
-
line = line[1:]
|
| 262 |
-
if line.startswith('[') and line.endswith(']'):
|
| 263 |
-
line = line[1:-1]
|
| 264 |
-
obj = json.loads(line)
|
| 265 |
-
data.append(obj)
|
| 266 |
-
except json.JSONDecodeError:
|
| 267 |
-
logger.warning(f"Failed to parse line: {line}")
|
| 268 |
-
return data
|
| 269 |
-
|
| 270 |
-
def extract_features(df):
|
| 271 |
-
"""Extract required features from the DataFrame"""
|
| 272 |
-
required_features = ['Description', 'SPSC', 'MfrID']
|
| 273 |
-
extracted_features = {}
|
| 274 |
-
missing_features = []
|
| 275 |
-
|
| 276 |
-
for feature in required_features:
|
| 277 |
-
# Try to find a column that contains the feature name (case-insensitive)
|
| 278 |
-
matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
|
| 279 |
-
if matching_columns:
|
| 280 |
-
extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
|
| 281 |
-
else:
|
| 282 |
-
missing_features.append(feature)
|
| 283 |
-
extracted_features[feature] = [''] * len(df) # Add empty placeholder
|
| 284 |
-
|
| 285 |
-
return extracted_features, missing_features
|
| 286 |
-
|
| 287 |
-
def preprocess_descriptions(descriptions):
|
| 288 |
-
try:
|
| 289 |
-
# TF-IDF transformation
|
| 290 |
-
tfidf_matrix = tfidf_vectorizer.transform(descriptions)
|
| 291 |
-
|
| 292 |
-
# SVD transformation
|
| 293 |
-
tfidf_svd = svd_model.transform(tfidf_matrix)
|
| 294 |
-
|
| 295 |
-
# Word2Vec processing
|
| 296 |
-
tokenized_descriptions = [word_tokenize(str(desc).lower()) for desc in descriptions]
|
| 297 |
-
doc_vectors = np.array([get_doc_vector(tokens) for tokens in tokenized_descriptions])
|
| 298 |
-
|
| 299 |
-
# Combine TF-IDF-SVD and Word2Vec features
|
| 300 |
-
return np.hstack([tfidf_svd, doc_vectors])
|
| 301 |
-
except Exception as e:
|
| 302 |
-
logger.error(f"Error in preprocess_descriptions: {str(e)}")
|
| 303 |
-
# Return a zero vector of appropriate size as a fallback
|
| 304 |
-
return np.zeros((len(descriptions), svd_model.n_components + w2v_model.vector_size))
|
| 305 |
-
|
| 306 |
-
def preprocess_spsc(spsc_codes):
|
| 307 |
-
return np.array(binarize_spsc(spsc_codes, spsc_encoder))
|
| 308 |
-
|
| 309 |
-
def preprocess_mfrid(mfrid_list):
|
| 310 |
-
return np.array(encode_mfrid(mfrid_list, mfrid_encoder))
|
| 311 |
-
|
| 312 |
-
def get_doc_vector(doc_tokens):
|
| 313 |
-
try:
|
| 314 |
-
word_vectors = [w2v_model.wv[word] for word in doc_tokens if word in w2v_model.wv]
|
| 315 |
-
return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
|
| 316 |
-
except Exception as e:
|
| 317 |
-
logger.error(f"Error in get_doc_vector: {str(e)}")
|
| 318 |
-
return np.zeros(w2v_model.vector_size)
|
| 319 |
-
|
| 320 |
-
def binarize_spsc(spsc_codes, spsc_encoder):
|
| 321 |
-
out = []
|
| 322 |
-
format_string = spsc_encoder['format_string']
|
| 323 |
-
bits = spsc_encoder['bits']
|
| 324 |
-
for v in spsc_codes:
|
| 325 |
-
try:
|
| 326 |
-
v = int(v) if not pd.isna(v) else 0
|
| 327 |
-
enc = format_string.format(v)
|
| 328 |
-
out.append([int(c) for c in enc])
|
| 329 |
-
except Exception:
|
| 330 |
-
out.append([0] * bits)
|
| 331 |
-
return out
|
| 332 |
-
|
| 333 |
-
def encode_mfrid(mfrid_list, mfrid_encoder):
|
| 334 |
-
mfrid_to_int = mfrid_encoder
|
| 335 |
-
bits = max(mfrid_to_int.values()).bit_length()
|
| 336 |
-
mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
|
| 337 |
-
encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
|
| 338 |
-
return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
|
| 339 |
-
|
| 340 |
# Create the Gradio Interface
|
| 341 |
inputs = [
|
| 342 |
gr.File(label="Upload JSON File", file_types=['.json']),
|
|
@@ -344,12 +181,9 @@ inputs = [
|
|
| 344 |
]
|
| 345 |
outputs = gr.Textbox(label="Predictions (CSV Format)")
|
| 346 |
|
| 347 |
-
# Define title and description for the Gradio interface
|
| 348 |
title = "Camcor Item Prediction Model"
|
| 349 |
-
description = "A machine learning model that predicts item categories based on product descriptions."
|
| 350 |
|
| 351 |
-
# Ensure that the Gradio Interface has title and description
|
| 352 |
demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
|
| 353 |
|
| 354 |
-
|
| 355 |
demo.launch()
|
|
|
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
import logging
|
| 13 |
+
import scipy.sparse
|
|
|
|
| 14 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 15 |
from sklearn.decomposition import TruncatedSVD
|
| 16 |
from nltk.tokenize import word_tokenize
|
| 17 |
from huggingface_hub import hf_hub_download
|
| 18 |
|
| 19 |
# Download the 'punkt' tokenizer
|
| 20 |
+
nltk.download('punkt', quiet=True)
|
| 21 |
|
| 22 |
# Set up logging
|
| 23 |
logging.basicConfig(level=logging.DEBUG)
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
# Get Hugging Face access token from environment variable
|
| 27 |
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
|
| 28 |
|
| 29 |
# Set the repository IDs
|
| 30 |
+
model_repo_id = 'apoppie/random_forest_model_20241005_202610'
|
| 31 |
|
| 32 |
# Load the trained Random Forest model from Hugging Face Hub
|
| 33 |
model_file = hf_hub_download(
|
| 34 |
repo_id=model_repo_id,
|
| 35 |
+
filename='random_forest_model_20241005_202610.joblib',
|
| 36 |
use_auth_token=access_token,
|
| 37 |
resume_download=True
|
| 38 |
)
|
| 39 |
+
rf_model = joblib.load(model_file)
|
|
|
|
| 40 |
|
| 41 |
+
# Load other necessary files
|
| 42 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 43 |
|
| 44 |
+
mfrid_encoder_path = os.path.join(current_dir, 'mfrid_encoder.pkl')
|
|
|
|
| 45 |
with open(mfrid_encoder_path, 'rb') as f:
|
| 46 |
mfrid_encoder = pickle.load(f)
|
| 47 |
|
| 48 |
+
spsc_encoder_path = os.path.join(current_dir, 'spsc_encoder.pkl')
|
|
|
|
| 49 |
with open(spsc_encoder_path, 'rb') as f:
|
| 50 |
spsc_encoder = pickle.load(f)
|
| 51 |
|
| 52 |
+
tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer.joblib')
|
| 53 |
+
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
|
| 56 |
+
svd_model = joblib.load(svd_model_path)
|
|
|
|
| 57 |
|
| 58 |
def preprocess_json_content(content):
|
|
|
|
|
|
|
|
|
|
| 59 |
content = content.strip()
|
|
|
|
|
|
|
|
|
|
| 60 |
if content.startswith('"') and content.endswith('"'):
|
| 61 |
content = content[1:-1]
|
|
|
|
|
|
|
|
|
|
| 62 |
if not content.startswith('['):
|
| 63 |
content = '[' + content
|
| 64 |
if not content.endswith(']'):
|
| 65 |
content = content + ']'
|
|
|
|
|
|
|
|
|
|
| 66 |
content = re.sub(r',\s*}', '}', content)
|
| 67 |
content = re.sub(r',\s*]', ']', content)
|
|
|
|
|
|
|
| 68 |
return content
|
| 69 |
|
| 70 |
+
def parse_json_content(content):
|
| 71 |
+
content = preprocess_json_content(content)
|
| 72 |
+
try:
|
| 73 |
+
data = json.loads(content)
|
| 74 |
+
return [data] if not isinstance(data, list) else data
|
| 75 |
+
except json.JSONDecodeError:
|
| 76 |
+
return parse_json_line_by_line(content)
|
| 77 |
+
|
| 78 |
def parse_json_line_by_line(content):
|
| 79 |
data = []
|
| 80 |
+
for line in content.split('\n'):
|
| 81 |
line = line.strip()
|
| 82 |
if line:
|
| 83 |
try:
|
|
|
|
| 84 |
if line.startswith(','):
|
| 85 |
line = line[1:]
|
|
|
|
| 86 |
if line.startswith('[') and line.endswith(']'):
|
| 87 |
line = line[1:-1]
|
| 88 |
obj = json.loads(line)
|
| 89 |
data.append(obj)
|
| 90 |
except json.JSONDecodeError:
|
| 91 |
+
logger.warning(f"Failed to parse line: {line}")
|
| 92 |
return data
|
| 93 |
|
| 94 |
def extract_features(df):
|
|
|
|
| 95 |
required_features = ['Description', 'SPSC', 'MfrID']
|
| 96 |
extracted_features = {}
|
| 97 |
missing_features = []
|
| 98 |
|
| 99 |
for feature in required_features:
|
|
|
|
| 100 |
matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
|
| 101 |
if matching_columns:
|
| 102 |
extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
|
| 103 |
else:
|
| 104 |
missing_features.append(feature)
|
| 105 |
+
extracted_features[feature] = [''] * len(df)
|
| 106 |
|
| 107 |
return extracted_features, missing_features
|
| 108 |
|
| 109 |
+
def preprocess_descriptions(descriptions):
|
| 110 |
+
try:
|
| 111 |
+
tfidf_matrix = tfidf_vectorizer.transform(descriptions)
|
| 112 |
+
tfidf_svd = svd_model.transform(tfidf_matrix)
|
| 113 |
+
return tfidf_svd
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logger.error(f"Error in preprocess_descriptions: {str(e)}")
|
| 116 |
+
return np.zeros((len(descriptions), svd_model.n_components_))
|
| 117 |
|
| 118 |
+
def binarize_spsc(spsc_codes):
|
| 119 |
+
out = []
|
| 120 |
+
bits = spsc_encoder
|
| 121 |
+
for v in spsc_codes:
|
| 122 |
+
try:
|
| 123 |
+
v = int(v) if not pd.isna(v) else 0
|
| 124 |
+
enc = format(v, f'0{bits}b')
|
| 125 |
+
out.append([int(c) for c in enc])
|
| 126 |
+
except Exception:
|
| 127 |
+
out.append([0] * bits)
|
| 128 |
+
return out
|
| 129 |
+
|
| 130 |
+
def encode_mfrid(mfrid_list):
|
| 131 |
+
bits = max(mfrid_encoder.values()).bit_length()
|
| 132 |
+
mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
|
| 133 |
+
encoded_mfrid = [mfrid_encoder.get(mfrid, mfrid_encoder.get('NaN', 0)) for mfrid in mfrid_list]
|
| 134 |
+
return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
|
| 135 |
|
| 136 |
def predict_from_json(json_file, confidence_threshold=0.7):
|
| 137 |
try:
|
| 138 |
+
content = json_file.read().decode('utf-8') if hasattr(json_file, 'read') else json_file
|
|
|
|
| 139 |
data = parse_json_content(content)
|
| 140 |
|
| 141 |
if not data:
|
|
|
|
| 142 |
return "No valid JSON objects found in the input. Please check your JSON format."
|
| 143 |
|
|
|
|
| 144 |
df = pd.DataFrame(data)
|
| 145 |
logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
|
| 146 |
|
|
|
|
| 147 |
extracted_features, missing_features = extract_features(df)
|
| 148 |
|
|
|
|
| 149 |
if missing_features:
|
| 150 |
logger.warning(f"Missing features: {', '.join(missing_features)}")
|
| 151 |
|
| 152 |
+
description_features = preprocess_descriptions(extracted_features['Description'])
|
| 153 |
+
spsc_array = np.array(binarize_spsc(extracted_features['SPSC']))
|
| 154 |
+
mfrid_array = np.array(encode_mfrid(extracted_features['MfrID']))
|
|
|
|
|
|
|
|
|
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
X = np.hstack([description_features, spsc_array, mfrid_array])
|
| 157 |
|
|
|
|
| 158 |
predictions = rf_model.predict(X)
|
| 159 |
probabilities = rf_model.predict_proba(X)
|
| 160 |
max_probabilities = np.max(probabilities, axis=1)
|
| 161 |
|
|
|
|
| 162 |
df['Prediction'] = predictions
|
| 163 |
df['Confidence'] = max_probabilities
|
| 164 |
df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
|
| 165 |
|
|
|
|
| 166 |
output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
|
| 167 |
available_columns = [col for col in output_columns if col in df.columns]
|
| 168 |
output_df = df[available_columns]
|
| 169 |
|
| 170 |
+
return output_df.to_csv(index=False)
|
|
|
|
|
|
|
| 171 |
|
| 172 |
except Exception as e:
|
| 173 |
logger.error("An error occurred", exc_info=True)
|
| 174 |
error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
|
| 175 |
return f"An error occurred:\n{error_message}"
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
# Create the Gradio Interface
|
| 178 |
inputs = [
|
| 179 |
gr.File(label="Upload JSON File", file_types=['.json']),
|
|
|
|
| 181 |
]
|
| 182 |
outputs = gr.Textbox(label="Predictions (CSV Format)")
|
| 183 |
|
|
|
|
| 184 |
title = "Camcor Item Prediction Model"
|
| 185 |
+
description = "A machine learning model that predicts item categories based on product descriptions, SPSC codes, and Manufacturer IDs."
|
| 186 |
|
|
|
|
| 187 |
demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
|
| 188 |
|
|
|
|
| 189 |
demo.launch()
|
description_engineering_models.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a8c02f0679219d9a502b311fe531c903b1ee92a850aa078f3c0fe803a8319653
|
| 3 |
-
size 192414565
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -3,8 +3,6 @@ pandas
|
|
| 3 |
numpy
|
| 4 |
scikit-learn==1.3.1
|
| 5 |
nltk
|
| 6 |
-
|
| 7 |
-
matplotlib
|
| 8 |
-
seaborn
|
| 9 |
-
tqdm
|
| 10 |
huggingface_hub
|
|
|
|
|
|
| 3 |
numpy
|
| 4 |
scikit-learn==1.3.1
|
| 5 |
nltk
|
| 6 |
+
scipy
|
|
|
|
|
|
|
|
|
|
| 7 |
huggingface_hub
|
| 8 |
+
joblib
|
setup.sh
CHANGED
|
@@ -1,4 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
python -c "import nltk; nltk.download('punkt')"
|
| 3 |
-
|
| 4 |
python -m nltk.downloader punkt
|
|
|
|
| 1 |
+
#!/bin/bash
|
|
|
|
|
|
|
| 2 |
python -m nltk.downloader punkt
|
w2v_model.model.syn1neg.npy → svd_model.joblib
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a0652f61e8da1fbc7cf5a62f3a91cf931524a7a350d576bdd6b6ee28350d260
|
| 3 |
+
size 16010343
|
svd_model.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ef6a3f0eaef3bb823e25f85cf234abe32d95917f754f06e8d1d7c7652f696d6a
|
| 3 |
-
size 12007799
|
|
|
|
|
|
|
|
|
|
|
|
tfidf_vectorizer.pkl → tfidf_vectorizer.joblib
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4df4279bc9d00aaae93a11f3036008a4c90a4cab827d9d80460dde91bf734d41
|
| 3 |
+
size 191789
|
w2v_model.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:80420b2c39799879e6dae5ebbb71fa0b2740b9a8c9ae0699ac52263770ffe090
|
| 3 |
-
size 7298780
|
|
|
|
|
|
|
|
|
|
|
|
w2v_model.model.wv.vectors.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9db84fed90d499bf7baaf423a26ffa04dd3c22c2cb1f62280f09e8709a50e80a
|
| 3 |
-
size 86028128
|
|
|
|
|
|
|
|
|
|
|
|