import gradio as gr
import pickle
import json
import numpy as np
import pandas as pd
import traceback
import nltk
import os
import re
import logging
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
from huggingface_hub import hf_hub_download
import joblib
from typing import Tuple, Dict, Any
# Download the 'punkt' tokenizer data needed by nltk.word_tokenize (quiet: no console spam)
nltk.download('punkt', quiet=True)
# Module-level logger; DEBUG level so preprocessing shape messages are visible
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Directory containing this file — all local encoder/vectorizer artifacts are loaded from here
current_dir = os.path.dirname(os.path.abspath(__file__))
# Hugging Face access token for the private model repo; None if the env var is unset
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
# Load all necessary files first
def load_models_and_encoders():
    """Load the trained Random Forest model and all preprocessing artifacts.

    The model pickle is fetched from the Hugging Face Hub; the label/MfrID/SPSC
    encoders, TF-IDF vectorizer and SVD model are loaded from files next to
    this script.

    Returns:
        dict: Keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Exception: Re-raised (after logging) if any artifact fails to load.
    """
    try:
        # Set the repository ID for the trained model
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'
        # Download the trained Random Forest model from the Hugging Face Hub.
        # NOTE: `use_auth_token`/`resume_download` are deprecated in
        # huggingface_hub — `token` is the supported parameter, and downloads
        # resume automatically.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            token=access_token
        )
        # SECURITY NOTE: unpickling executes arbitrary code — only load
        # pickles from repos you control (this one is our own model repo).
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")
        # Label encoder: maps class indices back to class names
        label_encoder_path = os.path.join(current_dir, 'label_encoder.joblib')
        label_encoder = joblib.load(label_encoder_path)
        logger.info("Label encoder loaded successfully.")
        # MfrID encoder: list or dict mapping manufacturer IDs to encodings
        mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
        with open(mfrid_encoder_path, 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")
        # SPSC encoder: dict with a 'bits' entry or a bare int bit-width
        # (consumed by binarize_spsc)
        spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
        with open(spsc_encoder_path, 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")
        # TF-IDF vectorizer fitted during training
        tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        logger.info("TF-IDF vectorizer loaded successfully.")
        # SVD model used to reduce the TF-IDF matrix dimensionality
        svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
        svd_model = joblib.load(svd_model_path)
        logger.info("SVD model loaded successfully.")
        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise
# Load every artifact once at import time so request handlers can reuse them.
models = load_models_and_encoders()

# Promote the artifacts to module-level globals used throughout the app.
rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']

# Normalize the MfrID encoder into a plain lookup table (MfrID -> encoding):
# a dict is used as-is, a list becomes {value: position}.
if isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
elif isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")
# Custom download link button
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as a CSV file.

    BUG FIX: the original body returned the plain text 'Download CSV' and
    ignored its argument entirely, so the UI showed dead text instead of a
    working link. The CSV is now embedded as a base64 data URI, matching the
    later create_download_link in the Gradio section (which shadows this one).
    """
    b64 = base64.b64encode(csv_string.encode('utf-8')).decode()
    return f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download CSV</a>'
def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarize each.

    Args:
        json_file: JSON upload accepted by predict_from_json.
        thresholds: Confidence thresholds to evaluate (default unchanged;
            now a tuple to avoid the mutable-default-argument pitfall).

    Returns:
        pd.DataFrame: Indexed by threshold percentage with columns
        "Human Review %" and "Confident Predictions %"; on failure, a
        DataFrame with an 'Error' column for each threshold.
    """
    results = {}
    try:
        for threshold in thresholds:
            df, _, _ = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                # Fraction of rows the model deferred to a human reviewer
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2)
                }
        # BUG FIX: if every threshold failed, `results` is empty and the
        # column assignment below would raise an opaque ValueError; surface
        # a clear message instead (handled by the except branch).
        if not results:
            raise ValueError("No thresholds produced predictions; check the input file format.")
        # Format the summary: percentage index, readable column names
        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x*100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        # Return a formatted error DataFrame so the UI still renders a table
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x*100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds)
        }).set_index('Threshold')
        return error_df
def parse_json_content(content):
    """Parse a JSON string into a list of records.

    A single JSON object is wrapped in a one-element list so callers can
    always iterate uniformly. On a parse failure, an empty list and an
    error message are returned instead of raising.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."
    records = parsed if isinstance(parsed, list) else [parsed]
    return records, "JSON parsing successful."
def preprocess_features(df: pd.DataFrame) -> dict:
    """Normalize the raw input columns exactly as done during training.

    Args:
        df: Input frame with 'Description', 'SPSC' and 'MfrID' columns.

    Returns:
        dict: Keys 'Description', 'SPSC', 'MfrID', each a list of strings
        ready for the downstream encoders.
    """
    # Descriptions: nulls become empty strings, everything stringified
    descriptions = df['Description'].fillna('').astype(str)
    # SPSC: nulls become 0; cast through int first so floats like 5.0
    # render as "5" rather than "5.0"
    spsc_codes = df['SPSC'].fillna(0).astype(int).astype(str)
    # MfrID: nulls become the sentinel 'Unknown'
    manufacturer_ids = df['MfrID'].fillna('Unknown').astype(str)
    return {
        'Description': descriptions.tolist(),
        'SPSC': spsc_codes.tolist(),
        'MfrID': manufacturer_ids.tolist(),
    }
def preprocess_descriptions(descriptions):
    """Project raw description strings into the reduced TF-IDF/SVD space.

    Uses the module-level tfidf_vectorizer and svd_model. On failure a
    zero matrix of the correct width is returned so prediction can still
    proceed (every description contributes nothing).
    """
    try:
        reduced = svd_model.transform(tfidf_vectorizer.transform(descriptions))
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        fallback = np.zeros((len(descriptions), svd_model.n_components))
        return fallback, f"Error in description preprocessing: {str(e)}"
    return reduced, f"Descriptions preprocessed. Shape: {reduced.shape}"
def binarize_spsc(spsc_codes):
    """Encode each SPSC code as a fixed-width list of binary digits.

    The bit width comes from the module-level spsc_encoder: a dict's
    'bits' entry, a bare int, or a 32-bit fallback when the type is
    unrecognized. Codes that cannot be encoded become all-zero vectors.
    """
    # Resolve the bit width from whatever form the encoder takes
    if isinstance(spsc_encoder, dict):
        bits = spsc_encoder['bits']
    elif isinstance(spsc_encoder, int):
        bits = spsc_encoder
    else:
        bits = 32  # Default to 32 bits if we can't determine the correct number
        logger.warning(f"Unexpected type for SPSC encoder: {type(spsc_encoder)}. Using default 32 bits.")
    encoded = []
    for v in spsc_codes:
        try:
            v = int(v) if not pd.isna(v) else 0
            encoded.append([int(digit) for digit in format(v, f'0{bits}b')])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {v}: {str(e)}")
            encoded.append([0] * bits)
    return encoded, f"SPSC codes binarized. Shape: {np.array(encoded).shape}"
def encode_mfrid(mfrid_list, mfrid_encoder):
    """Map each MfrID to its binary encoding via *mfrid_encoder*.

    Missing (NaN/None) or unknown IDs become all-zero vectors with the
    same width as the known encodings.

    Args:
        mfrid_list (list): MfrID values to encode.
        mfrid_encoder (dict): Maps str(MfrID) to its binary encoding.

    Returns:
        tuple: (encoded rows, validation message with the result shape)
    """
    # Width of one encoding row; 0 if the encoder is empty
    width = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0
    encoded = []
    for raw_id in mfrid_list:
        if pd.isna(raw_id) or str(raw_id) not in mfrid_encoder:
            encoded.append([0] * width)
        else:
            encoded.append(mfrid_encoder[str(raw_id)])
    return encoded, f"MfrID encoded. Shape: {np.array(encoded).shape}"
def predict_from_json(json_file, confidence_threshold=0.3):
    """Run the full prediction pipeline on a JSON upload.

    Args:
        json_file: Open file-like object or a filesystem path to JSON
            records with ID, Description, SPSC and MfrID fields
            (assumed from the columns this function reads — confirm with callers).
        confidence_threshold (float): Minimum top-class probability below
            which a row is flagged "Needs Human Review".

    Returns:
        tuple: (results DataFrame, CSV string, newline-joined validation
        log); on failure, (None, error message, validation log).
    """
    validation_steps = []
    try:
        # Accept both an open (binary) file object and a plain path
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            with open(json_file, 'r') as f:
                content = f.read()
        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]
        df = pd.DataFrame(data)

        # Normalize raw columns the same way training did
        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")

        # Encode each feature family, accumulating validation messages
        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)
        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)
        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)

        # Assemble the final feature matrix in training column order
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features)
        ])

        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)

        # Index of the second-highest-probability class per row
        runner_up = [np.argsort(row)[-2] for row in probabilities]

        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': label_encoder.inverse_transform(predictions),
            # Accept the prediction only when confidence clears the threshold
            'Review': [
                label_encoder.inverse_transform([pred])[0] if prob >= confidence_threshold
                else "Needs Human Review"
                for pred, prob in zip(predictions, max_probabilities)
            ],
            'Confidence Score': [f"{prob*100:.2f}%" for prob in max_probabilities],
            'Second_Best_Class': [
                label_encoder.inverse_transform([idx])[0] for idx in runner_up
            ],
            'Second_Best_Score': [
                f"{probabilities[i][idx]*100:.2f}%" for i, idx in enumerate(runner_up)
            ],
        })
        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)
    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        validation_steps.append(f"Error: {str(e)}")
        return None, f"An error occurred:\n{str(e)}", "\n".join(validation_steps)
# Diagnostic code
def analyze_prediction_confidence(model, X, predictions, confidences):
print(f"Number of predictions: {len(predictions)}")
print(f"Average confidence: {np.mean(confidences):.4f}")
print(f"Confidence distribution:")
print(pd.Series(confidences).describe())
# Compare feature distributions
print("\nFeature statistics:")
print(pd.DataFrame(X).describe())
return pd.DataFrame({
'prediction': predictions,
'confidence': confidences
}).sort_values('confidence', ascending=False)
def validate_features(X, expected_shape):
    """Validate the feature matrix width before prediction.

    Returns False on a column-count mismatch. Columns that are entirely
    zero only produce a warning — the matrix is still considered valid.
    """
    expected_cols = expected_shape[1]
    if X.shape[1] != expected_cols:
        logger.error(f"Feature mismatch: Expected {expected_cols}, got {X.shape[1]}")
        return False
    # Warn (do not fail) on columns with no signal at all
    zero_columns = np.all(X == 0, axis=0)
    if np.any(zero_columns):
        logger.warning(f"Found {np.sum(zero_columns)} all-zero features")
    return True
# GRADIO Interface
import gradio as gr
import base64
import pandas as pd
import numpy as np
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as predictions.csv.

    The CSV text is embedded in a base64 data URI so no server-side file
    is needed.
    """
    csv_bytes = csv_string.encode('utf-8')
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'data:file/csv;base64,{b64}'
    # BUG FIX: the original built `href` but never used it and returned
    # plain text with no markup — the UI showed dead text instead of a
    # working download link. Wrap it in an anchor element.
    return f'<a href="{href}" download="predictions.csv">Download Predictions CSV</a>'
def predict_and_display(json_file, confidence_threshold):
    """Gradio handler: run predictions and produce the three UI outputs.

    Returns (DataFrame or None, download-link HTML, status/validation text).
    """
    try:
        # Guard: nothing uploaded yet
        if json_file is None:
            return None, "", "No file uploaded. Please upload a JSON file."
        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is None:
            # predict_from_json reported a failure via its log text
            return None, "", f"Error processing predictions: {validation_steps}"
        return df, create_download_link(csv_string), validation_steps
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"
def create_diagnostic_output(json_file):
    """Build a plain-text diagnostic report for an uploaded JSON file.

    Sections: threshold sensitivity analysis, confidence-score statistics,
    review/accept distribution, and the pipeline's validation log.
    """
    try:
        # Threshold sweep first, then one detailed run at the default 0.3
        threshold_results = test_different_thresholds(json_file)
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)
        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."
        # Strip the trailing '%' so describe() works on numbers
        confidence_values = df['Confidence Score'].str.rstrip('%').astype(float)
        sections = [
            "=== Threshold Analysis ===",
            "Shows percentage of predictions requiring human review at different confidence thresholds:",
            threshold_results.to_string(),
            "\n=== Confidence Distribution ===",
            confidence_values.describe().to_string(),
            "\n=== Prediction Distribution ===",
            df['Review'].value_counts().to_string(),
            "\n=== Validation Steps ===",
            validation_steps,
        ]
        return "\n".join(sections)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return f"Error generating diagnostic report: {str(e)}\n\nPlease check the input file format and try again."
def build_interface():
    """Construct the Gradio Blocks UI: prediction, diagnostics and help tabs.

    Returns:
        gr.Blocks: The assembled (not yet launched) interface.

    Raises:
        gr.Error: If interface construction fails.
    """
    try:
        with gr.Blocks(title="Camcor Item Prediction Model") as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)
            with gr.Tabs():
                # Predictions Tab
                with gr.Tab("Make Predictions"):
                    # Two-column layout: inputs left, results right
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json']
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info="Lower values will result in more predictions, higher values in more human review cases"
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")
                            # Validation output in left column
                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value=""
                            )
                        # Right column for predictions, taking more space
                        with gr.Column(scale=2):
                            predictions_df = gr.DataFrame(
                                headers=[
                                    'SKU',
                                    'MfrID',
                                    'Predicted_Class',
                                    # BUG FIX: was the typo 'Confidence CLass';
                                    # headers must match predict_from_json's
                                    # output columns, whose 4th column is 'Review'
                                    'Review',
                                    'Confidence Score',
                                    'Second_Best_Class',
                                    'Second_Best_Score'
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False
                            )
                # Diagnostics Tab
                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json']
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")
                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30
                    )
                # Help Tab
                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link
                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)
            # Updated CSS for better layout and readability
            demo.css = """
            /* General layout improvements */
            .container {
            max-width: 100% !important;
            padding: 0 !important;
            margin: 0 !important;
            }
            /* DataFrame styling */
            .table-wrap {
            overflow: visible !important;
            max-height: none !important;
            border: none !important;
            }
            table {
            width: 100% !important;
            border-collapse: collapse !important;
            }
            th {
            background: #f3f4f6 !important;
            padding: 12px 8px !important;
            text-align: left !important;
            font-weight: 600 !important;
            color: #374151 !important;
            border-bottom: 2px solid #d1d5db !important;
            }
            td {
            padding: 12px 8px !important;
            border-bottom: 1px solid #e5e7eb !important;
            color: #4b5563 !important;
            }
            tr:nth-child(even) {
            background-color: #f9fafb !important;
            }
            /* Button styling */
            button.primary {
            background-color: #2563eb !important;
            color: white !important;
            padding: 10px 20px !important;
            border-radius: 6px !important;
            font-weight: 500 !important;
            }
            /* Validation output styling */
            .validation-box {
            background-color: #f3f4f6 !important;
            border: 1px solid #e5e7eb !important;
            border-radius: 6px !important;
            padding: 12px !important;
            margin: 12px 0 !important;
            }
            /* Download link styling */
            .download-link {
            margin: 12px 0 !important;
            padding: 8px !important;
            text-align: center !important;
            background-color: #f3f4f6 !important;
            border-radius: 6px !important;
            }
            /* Tab styling */
            .tab-nav {
            border-bottom: 2px solid #e5e7eb !important;
            padding-bottom: 0 !important;
            }
            .tab-nav button {
            padding: 12px 20px !important;
            margin-right: 4px !important;
            border-radius: 6px 6px 0 0 !important;
            }
            .tab-nav button.selected {
            background-color: #f3f4f6 !important;
            border-bottom: 2px solid #2563eb !important;
            }
            """
            # Wire the buttons to their handlers
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output]
            )
            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output]
            )
        return demo
    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")
def main():
    """Entry point: build the Gradio interface and launch the server."""
    try:
        app = build_interface()
        # Bind all interfaces so the app is reachable inside a container;
        # keep sharing off and surface errors in the UI.
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            share=False
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise


if __name__ == "__main__":
    main()