Spaces:

apoppie
/

camcor_rf

Sleeping

App Files Community

PixelPoppie committed on Oct 5, 2024

Commit

9fbbb5c

1 Parent(s): 62ee460

updated model, updated app, only used tf-idf

Browse files

Files changed (11) hide show

MfrID_encoder.pkl +2 -2
SPSC_encoder.pkl +2 -2
app.py +54 -220
description_engineering_models.pkl +0 -3
requirements.txt +2 -4
setup.sh +1 -3
w2v_model.model.syn1neg.npy → svd_model.joblib +2 -2
svd_model.pkl +0 -3
tfidf_vectorizer.pkl → tfidf_vectorizer.joblib +2 -2
w2v_model.model +0 -3
w2v_model.model.wv.vectors.npy +0 -3

MfrID_encoder.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f68f8424f6f855b23273b2e4cdd7c73d1110d951c51bb44909acbaacebf7dafd
-size 2862436

 version https://git-lfs.github.com/spec/v1
+oid sha256:5a687c15c69eeafc5c1145fd198ed9cb06d71c5732b06239c942744df1b15e79
+size 8730949

SPSC_encoder.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d3702df24056d5827adbb6c96b276893af28aa604678945ef0a2edf1bea583d
-size 68

 version https://git-lfs.github.com/spec/v1
+oid sha256:23b27e44a468816577f36b6ee4ee835fb12bdb57bcab2792fd0bae737c1a5f3a
+size 5

app.py CHANGED Viewed

@@ -10,333 +10,170 @@ import nltk
 import os
 import re
 import logging
-import gensim
-import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
 from nltk.tokenize import word_tokenize
 from huggingface_hub import hf_hub_download
 # Download the 'punkt' tokenizer
-nltk.download('punkt')
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
-# Download NLTK data
-nltk.download('punkt')
 # Get Hugging Face access token from environment variable
 access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
 # Set the repository IDs
-model_repo_id = 'apoppie/camcor_rf_20240929_174751'
 # Load the trained Random Forest model from Hugging Face Hub
 model_file = hf_hub_download(
     repo_id=model_repo_id,
-    filename='random_forest_model_20240929_174751.pkl',
     use_auth_token=access_token,
     resume_download=True
 )
-with open(model_file, 'rb') as f:
-    rf_model = pickle.load(f)
-# Assuming encoder and vectorizer .pkl files are in the same directory as app.py
 current_dir = os.path.dirname(os.path.abspath(__file__))
-# Load MfrID encoder from local directory
-mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
 with open(mfrid_encoder_path, 'rb') as f:
     mfrid_encoder = pickle.load(f)
-# Load SPSC encoder from local directory
-spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
 with open(spsc_encoder_path, 'rb') as f:
     spsc_encoder = pickle.load(f)
-# Load SVD model
-with open(os.path.join(current_dir, 'svd_model.pkl'), 'rb') as f:
-    svd_model = pickle.load(f)
-# Load TF-IDF vectorizer
-with open(os.path.join(current_dir, 'tfidf_vectorizer.pkl'), 'rb') as f:
-    tfidf_vectorizer = pickle.load(f)
-# Load Word2Vec model
-w2v_model = gensim.models.Word2Vec.load(os.path.join(current_dir, 'w2v_model.model'))
-def binarize_spsc(spsc_codes, spsc_encoder):
-    """Convert list of SPSC codes to binary values using the provided encoder"""
-    out = []
-    format_string = spsc_encoder['format_string']
-    bits = spsc_encoder['bits']
-    for v in spsc_codes:
-        try:
-            v = int(v) if not pd.isna(v) else 0  # Handle NaN
-            enc = format_string.format(v)
-            out.append([int(c) for c in enc])
-        except Exception:
-            out.append([0] * bits)  # Use zero vector for invalid entries
-    return out
-def encode_mfrid(mfrid_list, mfrid_encoder):
-    """Encode MfrID using the provided mapping and convert to binary"""
-    mfrid_to_int = mfrid_encoder  # Assuming mfrid_encoder is a dict
-    bits = max(mfrid_to_int.values()).bit_length()
-    mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
-    encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
-    binary_encoded = [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
-    return binary_encoded
-def get_doc_vector(doc_tokens):
-    word_vectors = [w2v_model.wv[word] for word in doc_tokens if word in w2v_model.wv]
-    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
 def preprocess_json_content(content):
-    logger.debug("Original content: %s", content[:100])  # Log first 100 characters
-    # Remove any leading/trailing whitespace
     content = content.strip()
-    logger.debug("After stripping: %s", content[:100])
-    # Handle case where content is wrapped in extra quotes
     if content.startswith('"') and content.endswith('"'):
         content = content[1:-1]
-    logger.debug("After removing extra quotes: %s", content[:100])
-    # Ensure content is wrapped in square brackets
     if not content.startswith('['):
         content = '[' + content
     if not content.endswith(']'):
         content = content + ']'
-    logger.debug("After adding brackets: %s", content[:100])
-    # Replace any trailing commas before closing brackets
     content = re.sub(r',\s*}', '}', content)
     content = re.sub(r',\s*]', ']', content)
-    logger.debug("After removing trailing commas: %s", content[:100])
     return content
 def parse_json_line_by_line(content):
     data = []
-    for i, line in enumerate(content.split('\n')):
         line = line.strip()
         if line:
             try:
-                # Handle lines that start with comma
                 if line.startswith(','):
                     line = line[1:]
-                # Handle lines that are wrapped in square brackets
                 if line.startswith('[') and line.endswith(']'):
                     line = line[1:-1]
                 obj = json.loads(line)
                 data.append(obj)
             except json.JSONDecodeError:
-                logger.warning(f"Failed to parse line {i+1}: {line}")
     return data
 def extract_features(df):
-    """Extract required features from the DataFrame"""
     required_features = ['Description', 'SPSC', 'MfrID']
     extracted_features = {}
     missing_features = []
     for feature in required_features:
-        # Try to find a column that contains the feature name (case-insensitive)
         matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
         if matching_columns:
             extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
         else:
             missing_features.append(feature)
-            extracted_features[feature] = [''] * len(df)  # Add empty placeholder
     return extracted_features, missing_features
-import json
-import numpy as np
-import pandas as pd
-import traceback
-import logging
-from nltk.tokenize import word_tokenize
-logger = logging.getLogger(__name__)
 def predict_from_json(json_file, confidence_threshold=0.7):
     try:
-        # Read and parse JSON content
-        content = read_json_file(json_file)
         data = parse_json_content(content)
         if not data:
-            logger.error("No valid JSON objects found in the input")
             return "No valid JSON objects found in the input. Please check your JSON format."
-        # Convert to DataFrame
         df = pd.DataFrame(data)
         logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
-        # Extract features
         extracted_features, missing_features = extract_features(df)
-        # Warn about missing features
         if missing_features:
             logger.warning(f"Missing features: {', '.join(missing_features)}")
-        # Preprocess features
-        try:
-            description_features = preprocess_descriptions(extracted_features['Description'])
-        except Exception as e:
-            logger.error(f"Error in preprocessing descriptions: {str(e)}")
-            description_features = np.zeros((len(extracted_features['Description']), svd_model.n_components + w2v_model.vector_size))
-        spsc_array = preprocess_spsc(extracted_features['SPSC'])
-        mfrid_array = preprocess_mfrid(extracted_features['MfrID'])
-        # Combine all features
         X = np.hstack([description_features, spsc_array, mfrid_array])
-        # Make predictions with probabilities
         predictions = rf_model.predict(X)
         probabilities = rf_model.predict_proba(X)
         max_probabilities = np.max(probabilities, axis=1)
-        # Add predictions and confidence to the DataFrame
         df['Prediction'] = predictions
         df['Confidence'] = max_probabilities
         df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
-        # Select columns for output, including only available columns
         output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
         available_columns = [col for col in output_columns if col in df.columns]
         output_df = df[available_columns]
-        # Return the DataFrame as a CSV string
-        csv_result = output_df.to_csv(index=False)
-        return csv_result
     except Exception as e:
         logger.error("An error occurred", exc_info=True)
         error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
         return f"An error occurred:\n{error_message}"
-def read_json_file(json_file):
-    if isinstance(json_file, str):
-        with open(json_file, 'r') as file:
-            return file.read()
-    elif hasattr(json_file, 'read'):
-        content = json_file.read()
-        return content.decode('utf-8') if isinstance(content, bytes) else content
-    else:
-        raise ValueError("Invalid input type. Expected file path or file-like object.")
-def parse_json_content(content):
-    content = preprocess_json_content(content)
-    try:
-        data = json.loads(content)
-        return [data] if not isinstance(data, list) else data
-    except json.JSONDecodeError:
-        return parse_json_line_by_line(content)
-def preprocess_json_content(content):
-    content = content.strip()
-    if content.startswith('"') and content.endswith('"'):
-        content = content[1:-1]
-    if not content.startswith('['):
-        content = '[' + content
-    if not content.endswith(']'):
-        content = content + ']'
-    content = content.replace(',]', ']').replace(',}', '}')
-    return content
-def parse_json_line_by_line(content):
-    data = []
-    for line in content.split('\n'):
-        line = line.strip()
-        if line:
-            try:
-                if line.startswith(','):
-                    line = line[1:]
-                if line.startswith('[') and line.endswith(']'):
-                    line = line[1:-1]
-                obj = json.loads(line)
-                data.append(obj)
-            except json.JSONDecodeError:
-                logger.warning(f"Failed to parse line: {line}")
-    return data
-def extract_features(df):
-    """Extract required features from the DataFrame"""
-    required_features = ['Description', 'SPSC', 'MfrID']
-    extracted_features = {}
-    missing_features = []
-    for feature in required_features:
-        # Try to find a column that contains the feature name (case-insensitive)
-        matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
-        if matching_columns:
-            extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
-        else:
-            missing_features.append(feature)
-            extracted_features[feature] = [''] * len(df)  # Add empty placeholder
-    return extracted_features, missing_features
-def preprocess_descriptions(descriptions):
-    try:
-        # TF-IDF transformation
-        tfidf_matrix = tfidf_vectorizer.transform(descriptions)
-        # SVD transformation
-        tfidf_svd = svd_model.transform(tfidf_matrix)
-        # Word2Vec processing
-        tokenized_descriptions = [word_tokenize(str(desc).lower()) for desc in descriptions]
-        doc_vectors = np.array([get_doc_vector(tokens) for tokens in tokenized_descriptions])
-        # Combine TF-IDF-SVD and Word2Vec features
-        return np.hstack([tfidf_svd, doc_vectors])
-    except Exception as e:
-        logger.error(f"Error in preprocess_descriptions: {str(e)}")
-        # Return a zero vector of appropriate size as a fallback
-        return np.zeros((len(descriptions), svd_model.n_components + w2v_model.vector_size))
-def preprocess_spsc(spsc_codes):
-    return np.array(binarize_spsc(spsc_codes, spsc_encoder))
-def preprocess_mfrid(mfrid_list):
-    return np.array(encode_mfrid(mfrid_list, mfrid_encoder))
-def get_doc_vector(doc_tokens):
-    try:
-        word_vectors = [w2v_model.wv[word] for word in doc_tokens if word in w2v_model.wv]
-        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
-    except Exception as e:
-        logger.error(f"Error in get_doc_vector: {str(e)}")
-        return np.zeros(w2v_model.vector_size)
-def binarize_spsc(spsc_codes, spsc_encoder):
-    out = []
-    format_string = spsc_encoder['format_string']
-    bits = spsc_encoder['bits']
-    for v in spsc_codes:
-        try:
-            v = int(v) if not pd.isna(v) else 0
-            enc = format_string.format(v)
-            out.append([int(c) for c in enc])
-        except Exception:
-            out.append([0] * bits)
-    return out
-def encode_mfrid(mfrid_list, mfrid_encoder):
-    mfrid_to_int = mfrid_encoder
-    bits = max(mfrid_to_int.values()).bit_length()
-    mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
-    encoded_mfrid = [mfrid_to_int.get(mfrid, mfrid_to_int.get('NaN', 0)) for mfrid in mfrid_list]
-    return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
 # Create the Gradio Interface
 inputs = [
     gr.File(label="Upload JSON File", file_types=['.json']),
@@ -344,12 +181,9 @@ inputs = [
 ]
 outputs = gr.Textbox(label="Predictions (CSV Format)")
-# Define title and description for the Gradio interface
 title = "Camcor Item Prediction Model"
-description = "A machine learning model that predicts item categories based on product descriptions."
-# Ensure that the Gradio Interface has title and description
 demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
 demo.launch()

 import os
 import re
 import logging
+import scipy.sparse
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
 from nltk.tokenize import word_tokenize
 from huggingface_hub import hf_hub_download
 # Download the 'punkt' tokenizer
+nltk.download('punkt', quiet=True)
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 # Get Hugging Face access token from environment variable
 access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
 # Set the repository IDs
+model_repo_id = 'apoppie/random_forest_model_20241005_202610'
 # Load the trained Random Forest model from Hugging Face Hub
 model_file = hf_hub_download(
     repo_id=model_repo_id,
+    filename='random_forest_model_20241005_202610.joblib',
     use_auth_token=access_token,
     resume_download=True
 )
+rf_model = joblib.load(model_file)
+# Load other necessary files
 current_dir = os.path.dirname(os.path.abspath(__file__))
+mfrid_encoder_path = os.path.join(current_dir, 'mfrid_encoder.pkl')
 with open(mfrid_encoder_path, 'rb') as f:
     mfrid_encoder = pickle.load(f)
+spsc_encoder_path = os.path.join(current_dir, 'spsc_encoder.pkl')
 with open(spsc_encoder_path, 'rb') as f:
     spsc_encoder = pickle.load(f)
+tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer.joblib')
+tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
+svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
+svd_model = joblib.load(svd_model_path)
 def preprocess_json_content(content):
     content = content.strip()
     if content.startswith('"') and content.endswith('"'):
         content = content[1:-1]
     if not content.startswith('['):
         content = '[' + content
     if not content.endswith(']'):
         content = content + ']'
     content = re.sub(r',\s*}', '}', content)
     content = re.sub(r',\s*]', ']', content)
     return content
+def parse_json_content(content):
+    content = preprocess_json_content(content)
+    try:
+        data = json.loads(content)
+        return [data] if not isinstance(data, list) else data
+    except json.JSONDecodeError:
+        return parse_json_line_by_line(content)
 def parse_json_line_by_line(content):
     data = []
+    for line in content.split('\n'):
         line = line.strip()
         if line:
             try:
                 if line.startswith(','):
                     line = line[1:]
                 if line.startswith('[') and line.endswith(']'):
                     line = line[1:-1]
                 obj = json.loads(line)
                 data.append(obj)
             except json.JSONDecodeError:
+                logger.warning(f"Failed to parse line: {line}")
     return data
 def extract_features(df):
     required_features = ['Description', 'SPSC', 'MfrID']
     extracted_features = {}
     missing_features = []
     for feature in required_features:
         matching_columns = [col for col in df.columns if feature.lower() in col.lower()]
         if matching_columns:
             extracted_features[feature] = df[matching_columns[0]].fillna('').astype(str).tolist()
         else:
             missing_features.append(feature)
+            extracted_features[feature] = [''] * len(df)
     return extracted_features, missing_features
+def preprocess_descriptions(descriptions):
+    try:
+        tfidf_matrix = tfidf_vectorizer.transform(descriptions)
+        tfidf_svd = svd_model.transform(tfidf_matrix)
+        return tfidf_svd
+    except Exception as e:
+        logger.error(f"Error in preprocess_descriptions: {str(e)}")
+        return np.zeros((len(descriptions), svd_model.n_components_))
+def binarize_spsc(spsc_codes):
+    out = []
+    bits = spsc_encoder
+    for v in spsc_codes:
+        try:
+            v = int(v) if not pd.isna(v) else 0
+            enc = format(v, f'0{bits}b')
+            out.append([int(c) for c in enc])
+        except Exception:
+            out.append([0] * bits)
+    return out
+def encode_mfrid(mfrid_list):
+    bits = max(mfrid_encoder.values()).bit_length()
+    mfrid_list = ['NaN' if pd.isna(x) else str(x) for x in mfrid_list]
+    encoded_mfrid = [mfrid_encoder.get(mfrid, mfrid_encoder.get('NaN', 0)) for mfrid in mfrid_list]
+    return [list(map(int, format(code, f'0{bits}b'))) for code in encoded_mfrid]
 def predict_from_json(json_file, confidence_threshold=0.7):
     try:
+        content = json_file.read().decode('utf-8') if hasattr(json_file, 'read') else json_file
         data = parse_json_content(content)
         if not data:
             return "No valid JSON objects found in the input. Please check your JSON format."
         df = pd.DataFrame(data)
         logger.info(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
         extracted_features, missing_features = extract_features(df)
         if missing_features:
             logger.warning(f"Missing features: {', '.join(missing_features)}")
+        description_features = preprocess_descriptions(extracted_features['Description'])
+        spsc_array = np.array(binarize_spsc(extracted_features['SPSC']))
+        mfrid_array = np.array(encode_mfrid(extracted_features['MfrID']))
         X = np.hstack([description_features, spsc_array, mfrid_array])
         predictions = rf_model.predict(X)
         probabilities = rf_model.predict_proba(X)
         max_probabilities = np.max(probabilities, axis=1)
         df['Prediction'] = predictions
         df['Confidence'] = max_probabilities
         df['Class'] = df.apply(lambda row: row['Prediction'] if row['Confidence'] >= confidence_threshold else "Needs Human Review", axis=1)
         output_columns = ['SKU', 'Description', 'SPSC', 'MfrID', 'Class', 'Confidence']
         available_columns = [col for col in output_columns if col in df.columns]
         output_df = df[available_columns]
+        return output_df.to_csv(index=False)
     except Exception as e:
         logger.error("An error occurred", exc_info=True)
         error_message = ''.join(traceback.format_exception(None, e, e.__traceback__))
         return f"An error occurred:\n{error_message}"
 # Create the Gradio Interface
 inputs = [
     gr.File(label="Upload JSON File", file_types=['.json']),
 ]
 outputs = gr.Textbox(label="Predictions (CSV Format)")
 title = "Camcor Item Prediction Model"
+description = "A machine learning model that predicts item categories based on product descriptions, SPSC codes, and Manufacturer IDs."
 demo = gr.Interface(fn=predict_from_json, inputs=inputs, outputs=outputs, title=title, description=description)
 demo.launch()

description_engineering_models.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a8c02f0679219d9a502b311fe531c903b1ee92a850aa078f3c0fe803a8319653
-size 192414565

requirements.txt CHANGED Viewed

@@ -3,8 +3,6 @@ pandas
 numpy
 scikit-learn==1.3.1
 nltk
-gensim
-matplotlib
-seaborn
-tqdm
 huggingface_hub

 numpy
 scikit-learn==1.3.1
 nltk
+scipy
 huggingface_hub
+joblib

setup.sh CHANGED Viewed

@@ -1,4 +1,2 @@
-# setup.sh
-python -c "import nltk; nltk.download('punkt')"
 python -m nltk.downloader punkt


+#!/bin/bash
 python -m nltk.downloader punkt

w2v_model.model.syn1neg.npy → svd_model.joblib RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b74d6285b82c2b7fb6c501a3bb5179dfc2d37de0befaf1734bf5d9c7624dbe17
-size 86028128

 version https://git-lfs.github.com/spec/v1
+oid sha256:4a0652f61e8da1fbc7cf5a62f3a91cf931524a7a350d576bdd6b6ee28350d260
+size 16010343

svd_model.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ef6a3f0eaef3bb823e25f85cf234abe32d95917f754f06e8d1d7c7652f696d6a
-size 12007799

tfidf_vectorizer.pkl → tfidf_vectorizer.joblib RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a95c5adc47bd65250c271f4d3e6b12822c26206fe1997ef1f7e82e30f2f13f27
-size 191654

 version https://git-lfs.github.com/spec/v1
+oid sha256:4df4279bc9d00aaae93a11f3036008a4c90a4cab827d9d80460dde91bf734d41
+size 191789

w2v_model.model DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:80420b2c39799879e6dae5ebbb71fa0b2740b9a8c9ae0699ac52263770ffe090
-size 7298780

w2v_model.model.wv.vectors.npy DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9db84fed90d499bf7baaf423a26ffa04dd3c22c2cb1f62280f09e8709a50e80a
-size 86028128