# camcor_rf / app.py — uploaded by PixelPoppie ("ui update", commit 2a0f308)
import base64
import json
import logging
import os
import pickle
import re
import traceback
from typing import Tuple, Dict, Any

import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import scipy.sparse
from huggingface_hub import hf_hub_download
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# Download the 'punkt' tokenizer (no-op if it is already in the NLTK cache);
# quiet=True suppresses the progress output on every app start.
nltk.download('punkt', quiet=True)
# Set up logging.
# NOTE(review): DEBUG is very verbose for a deployed Space — consider INFO.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Define current directory — all encoder/vectorizer artifacts are loaded
# relative to this file, not the process working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Get Hugging Face access token from environment variable.
# May be None; presumably the model repo is then public — verify.
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
# Load all necessary files first
def load_models_and_encoders():
    """Load the Random Forest model (from the Hugging Face Hub) plus all
    local encoders/transformers required for inference.

    Returns:
        dict with keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Any load/download exception, after logging it — the app cannot run
        with a partially loaded model stack.
    """
    try:
        # Set the repository IDs
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'
        # Load the trained Random Forest model from Hugging Face Hub.
        # NOTE(review): the filename timestamp (145919) differs from the repo
        # id's (202610) — confirm this is the intended artifact.
        # NOTE(review): `use_auth_token` and `resume_download` are deprecated
        # in recent huggingface_hub releases (`token=` replaces the former) —
        # verify against the pinned library version.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            use_auth_token=access_token,
            resume_download=True
        )
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # acceptable because the artifact comes from our own repository.
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")
        # Load label encoder (maps predicted class indices back to names).
        label_encoder_path = os.path.join(current_dir, 'label_encoder.joblib')
        label_encoder = joblib.load(label_encoder_path)
        logger.info("Label encoder loaded successfully.")
        # Load MfrID encoder (type is data-dependent: list or dict — see the
        # normalization into `mfrid_to_index` below).
        mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
        with open(mfrid_encoder_path, 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")
        # Load SPSC encoder (consumed by binarize_spsc: dict with 'bits' or int).
        spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
        with open(spsc_encoder_path, 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")
        # Load TF-IDF vectorizer
        tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        logger.info("TF-IDF vectorizer loaded successfully.")
        # Load SVD model (dimensionality reduction over the TF-IDF space).
        svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
        svd_model = joblib.load(svd_model_path)
        logger.info("SVD model loaded successfully.")
        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise
# Load all models and encoders once, at import time; a failure aborts startup.
models = load_models_and_encoders()
# Assign models to global variables consumed by the preprocessing and
# prediction helpers below.
rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']
# Set up mfrid_to_index: normalize the MfrID encoder into a dict.
# NOTE(review): when the encoder is a list, the values here are integer
# indices, but encode_mfrid() calls len() on the dict's values (expecting
# bit vectors) — confirm which format the shipped MfrID_encoder.pkl uses.
if isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
elif isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")
# Custom download link button
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as predictions.csv.

    Bug fix: the original did `csv_string.encode().decode()` — a no-op that
    put raw CSV into a data URI declaring base64 encoding, yielding a broken
    link. The payload is now actually base64-encoded.

    NOTE(review): this definition is shadowed by the later
    create_download_link in the Gradio section; consider removing one.
    """
    b64 = base64.b64encode(csv_string.encode('utf-8')).decode('ascii')
    return f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download CSV</a>'
def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarize how
    many rows would need human review at each one.

    Args:
        json_file: path or file-like object accepted by predict_from_json.
        thresholds: iterable of thresholds in (0, 1]. The default is now a
            tuple — the original used a mutable list default, a shared-state
            hazard.

    Returns:
        DataFrame indexed by threshold percentage with "Human Review %" and
        "Confident Predictions %" columns, or an error DataFrame on failure.
    """
    thresholds = list(thresholds)
    results = {}
    try:
        for threshold in thresholds:
            df, _, validation_msg = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                # 'Review' holds either the accepted class name or the
                # "Needs Human Review" sentinel set by predict_from_json.
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2)
                }
        if not results:
            # Surface the real problem instead of failing below with a
            # confusing pandas column-length mismatch on an empty frame.
            raise ValueError("No predictions could be produced for any threshold")
        # Create DataFrame with better formatting
        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x*100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        # Return a formatted error DataFrame
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x*100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds)
        }).set_index('Threshold')
        return error_df
def parse_json_content(content):
    """Parse a JSON string into a list of records.

    A single top-level object is wrapped in a one-element list so callers
    can always iterate. Returns (records, status_message); on a parse error
    the record list is empty and the message explains the failure.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."
    records = parsed if isinstance(parsed, list) else [parsed]
    return records, "JSON parsing successful."
def preprocess_features(df: pd.DataFrame) -> dict:
    """Normalize the three raw input columns the same way training did.

    Null handling mirrors the training pipeline: missing descriptions become
    empty strings, missing SPSC codes become 0, missing MfrIDs become
    'Unknown'. SPSC goes through int before str so '12.0' becomes '12'.

    Returns a dict of plain Python lists keyed 'Description', 'SPSC', 'MfrID'.
    """
    description = df['Description'].fillna('').astype(str)
    spsc = df['SPSC'].fillna(0).astype(int).astype(str)
    mfrid = df['MfrID'].fillna('Unknown').astype(str)
    return {
        'Description': description.tolist(),
        'SPSC': spsc.tolist(),
        'MfrID': mfrid.tolist(),
    }
def preprocess_descriptions(descriptions):
    """Project raw description strings into the trained SVD feature space.

    Relies on the module-level ``tfidf_vectorizer`` and ``svd_model``. On any
    failure it degrades to an all-zero matrix of the right shape instead of
    raising, so one bad batch cannot abort the whole prediction run.

    Returns (feature_matrix, status_message).
    """
    try:
        reduced = svd_model.transform(tfidf_vectorizer.transform(descriptions))
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        fallback = np.zeros((len(descriptions), svd_model.n_components))
        return fallback, f"Error in description preprocessing: {str(e)}"
    return reduced, f"Descriptions preprocessed. Shape: {reduced.shape}"
def binarize_spsc(spsc_codes):
    """Fixed-width binary-encode a sequence of SPSC codes.

    The bit width comes from the module-level ``spsc_encoder`` (a dict with
    a 'bits' key, or a bare int); anything else falls back to 32 bits with a
    warning. NaN codes are treated as 0; codes that still fail to encode
    become all-zero rows, so the output always has one row per input.

    Returns (rows, status_message).
    """
    if isinstance(spsc_encoder, dict):
        width = spsc_encoder['bits']
    elif isinstance(spsc_encoder, int):
        width = spsc_encoder
    else:
        width = 32  # Default to 32 bits if we can't determine the correct number
        logger.warning(f"Unexpected type for SPSC encoder: {type(spsc_encoder)}. Using default 32 bits.")
    rows = []
    for code in spsc_codes:
        try:
            code = int(code) if not pd.isna(code) else 0
            rows.append([int(bit) for bit in format(code, f'0{width}b')])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {code}: {str(e)}")
            rows.append([0] * width)
    return rows, f"SPSC codes binarized. Shape: {np.array(rows).shape}"
def encode_mfrid(mfrid_list, mfrid_encoder):
    """
    Convert a list of MfrID values to binary encodings via the given encoder.

    Args:
        mfrid_list (list): MfrID values to encode; NaN/None allowed.
        mfrid_encoder (dict): Maps str(MfrID) -> binary encoding vector.
            The vector width is taken from the first value.
            NOTE(review): if the encoder's values are ints (see the
            list-derived mfrid_to_index at module level) the len() call
            below would fail — confirm the encoder format.

    Returns:
        tuple: (encoded_mfrid, validation_message). Missing or unknown IDs
        become all-zero vectors, so output length matches input length.
    """
    width = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0
    rows = []
    for raw in mfrid_list:
        if pd.isna(raw):
            rows.append([0] * width)
        else:
            rows.append(mfrid_encoder.get(str(raw), [0] * width))
    return rows, f"MfrID encoded. Shape: {np.array(rows).shape}"
def predict_from_json(json_file, confidence_threshold=0.3):
    """
    Make predictions from JSON input file with consistent preprocessing.

    Args:
        json_file: a file-like object (has .read(), assumed UTF-8 bytes) or
            a filesystem path to a JSON file holding one record or a list of
            records. Each record is expected to provide 'ID', 'Description',
            'SPSC' and 'MfrID' keys — missing values are tolerated, but a
            missing column would raise (caught below). TODO confirm schema
            against callers.
        confidence_threshold: minimum top-class probability for a prediction
            to be accepted; below it the row is marked "Needs Human Review".

    Returns:
        (output_df, csv_string, validation_log) on success, or
        (None, error_message, validation_log) on failure.
    """
    validation_steps = []
    try:
        # Parse JSON content — accept both an uploaded file object and a path.
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            with open(json_file, 'r') as f:
                content = f.read()
        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]  # single object -> one-row DataFrame
        # Create DataFrame
        df = pd.DataFrame(data)
        # Use the new centralized preprocessing
        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")
        # Process features with existing functions
        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)
        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)
        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)
        # Combine features: [TF-IDF/SVD | SPSC bits | MfrID bits].
        # Presumably this matches the column order used at training time —
        # verify, the model is order-sensitive.
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features)
        ])
        # Make predictions
        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)
        # Create output DataFrame with more detailed information
        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': label_encoder.inverse_transform(predictions),
            # 'Review' repeats the class name when confident enough,
            # otherwise the human-review sentinel used elsewhere in the app.
            'Review': [
                label_encoder.inverse_transform([pred])[0] if prob >= confidence_threshold
                else "Needs Human Review"
                for pred, prob in zip(predictions, max_probabilities)
            ],
            'Confidence Score': [f"{prob*100:.2f}%" for prob in max_probabilities],
            # Runner-up class/score give human reviewers an alternative label.
            'Second_Best_Class': [
                label_encoder.inverse_transform([np.argsort(probs)[-2]])[0]
                for probs in probabilities
            ],
            'Second_Best_Score': [f"{probs[np.argsort(probs)[-2]]*100:.2f}%"
                                  for probs in probabilities]
        })
        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)
    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        error_message = f"An error occurred:\n{str(e)}"
        validation_steps.append(f"Error: {str(e)}")
        return None, error_message, "\n".join(validation_steps)
# Diagnostic code
def analyze_prediction_confidence(model, X, predictions, confidences):
    """Print summary diagnostics and return predictions ranked by confidence.

    ``model`` is accepted for call-site symmetry but not used. All summary
    output goes to stdout — this is a debugging helper, not the serving path.
    Returns a DataFrame with 'prediction' and 'confidence' columns sorted by
    descending confidence.
    """
    print(f"Number of predictions: {len(predictions)}")
    print(f"Average confidence: {np.mean(confidences):.4f}")
    print(f"Confidence distribution:")
    print(pd.Series(confidences).describe())
    print("\nFeature statistics:")
    print(pd.DataFrame(X).describe())
    ranked = pd.DataFrame({'prediction': predictions, 'confidence': confidences})
    return ranked.sort_values('confidence', ascending=False)
def validate_features(X, expected_shape):
    """Validate a feature matrix before prediction.

    Returns False when the column count differs from the expected (training)
    shape; otherwise True. All-zero columns only trigger a warning — they
    can legitimately occur for categories unseen in this batch.
    """
    if X.shape[1] != expected_shape[1]:
        logger.error(f"Feature mismatch: Expected {expected_shape[1]}, got {X.shape[1]}")
        return False
    # Count dead (all-zero) columns in one pass.
    dead_columns = np.sum(np.all(X == 0, axis=0))
    if dead_columns:
        logger.warning(f"Found {dead_columns} all-zero features")
    return True
# GRADIO Interface
import gradio as gr
import base64
import pandas as pd
import numpy as np
def create_download_link(csv_string):
    """Build an inline data-URI anchor for downloading predictions as CSV.

    The CSV text is base64-encoded into the URI so no server-side file is
    needed; the browser decodes it on click.
    """
    payload = base64.b64encode(csv_string.encode('utf-8')).decode()
    uri = f'data:file/csv;base64,{payload}'
    return f'<a href="{uri}" download="predictions.csv" target="_blank">Download Predictions CSV</a>'
def predict_and_display(json_file, confidence_threshold):
    """Gradio click handler: run predictions and package the UI outputs.

    Returns a 3-tuple matching the output components:
    (predictions DataFrame or None, download-link HTML, status text).
    """
    if json_file is None:
        return None, "", "No file uploaded. Please upload a JSON file."
    try:
        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is None:
            return None, "", f"Error processing predictions: {validation_steps}"
        return df, create_download_link(csv_string), validation_steps
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"
def create_diagnostic_output(json_file):
    """Create comprehensive diagnostic information.

    Builds a plain-text report with four sections: human-review rates across
    confidence thresholds, the distribution of confidence scores, the
    distribution of final 'Review' labels, and the step-by-step validation
    log from the prediction run. Returns an error string (never raises) so
    the Gradio textbox always gets something displayable.
    """
    try:
        # Test different thresholds
        threshold_results = test_different_thresholds(json_file)
        # Make predictions with default threshold to get detailed metrics
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)
        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."
        # Create detailed report
        report = []
        report.append("=== Threshold Analysis ===")
        report.append("Shows percentage of predictions requiring human review at different confidence thresholds:")
        report.append(threshold_results.to_string())
        report.append("\n=== Confidence Distribution ===")
        # Scores are formatted strings like "87.50%" — strip '%' and re-parse.
        conf_scores = df['Confidence Score'].str.rstrip('%').astype(float)
        conf_stats = conf_scores.describe()
        report.append(conf_stats.to_string())
        report.append("\n=== Prediction Distribution ===")
        review_dist = df['Review'].value_counts()  # Changed from 'Class' to 'Review'
        report.append(review_dist.to_string())
        report.append("\n=== Validation Steps ===")
        report.append(validation_steps)
        return "\n".join(report)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return f"Error generating diagnostic report: {str(e)}\n\nPlease check the input file format and try again."
def build_interface():
    """Assemble the Gradio Blocks UI (Predictions, Diagnostics, Help tabs).

    Returns the Blocks app; raises gr.Error if construction fails.

    Bug fix: the predictions table header listed a non-existent
    'Confidence CLass' column — predict_from_json produces a 'Review'
    column in that position, so the header now matches the actual output.
    """
    try:
        with gr.Blocks(title="Camcor Item Prediction Model") as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)
            with gr.Tabs():
                # Predictions Tab
                with gr.Tab("Make Predictions"):
                    # Create a two-column layout
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json']
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info="Lower values will result in more predictions, higher values in more human review cases"
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")
                            # Validation output in left column
                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value=""
                            )
                        # Right column for predictions, taking more space
                        with gr.Column(scale=2):
                            predictions_df = gr.DataFrame(
                                # Headers must match predict_from_json's
                                # output DataFrame columns, in order.
                                headers=[
                                    'SKU',
                                    'MfrID',
                                    'Predicted_Class',
                                    'Review',  # was the misspelled, non-existent 'Confidence CLass'
                                    'Confidence Score',
                                    'Second_Best_Class',
                                    'Second_Best_Score'
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False
                            )
                # Diagnostics Tab
                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json']
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")
                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30
                    )
                # Help Tab
                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link
                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)
            # Updated CSS for better layout and readability
            demo.css = """
            /* General layout improvements */
            .container {
            max-width: 100% !important;
            padding: 0 !important;
            margin: 0 !important;
            }
            /* DataFrame styling */
            .table-wrap {
            overflow: visible !important;
            max-height: none !important;
            border: none !important;
            }
            table {
            width: 100% !important;
            border-collapse: collapse !important;
            }
            th {
            background: #f3f4f6 !important;
            padding: 12px 8px !important;
            text-align: left !important;
            font-weight: 600 !important;
            color: #374151 !important;
            border-bottom: 2px solid #d1d5db !important;
            }
            td {
            padding: 12px 8px !important;
            border-bottom: 1px solid #e5e7eb !important;
            color: #4b5563 !important;
            }
            tr:nth-child(even) {
            background-color: #f9fafb !important;
            }
            /* Button styling */
            button.primary {
            background-color: #2563eb !important;
            color: white !important;
            padding: 10px 20px !important;
            border-radius: 6px !important;
            font-weight: 500 !important;
            }
            /* Validation output styling */
            .validation-box {
            background-color: #f3f4f6 !important;
            border: 1px solid #e5e7eb !important;
            border-radius: 6px !important;
            padding: 12px !important;
            margin: 12px 0 !important;
            }
            /* Download link styling */
            .download-link {
            margin: 12px 0 !important;
            padding: 8px !important;
            text-align: center !important;
            background-color: #f3f4f6 !important;
            border-radius: 6px !important;
            }
            /* Tab styling */
            .tab-nav {
            border-bottom: 2px solid #e5e7eb !important;
            padding-bottom: 0 !important;
            }
            .tab-nav button {
            padding: 12px 20px !important;
            margin-right: 4px !important;
            border-radius: 6px 6px 0 0 !important;
            }
            .tab-nav button.selected {
            background-color: #f3f4f6 !important;
            border-bottom: 2px solid #2563eb !important;
            }
            """
            # Set up event handlers (must be registered inside the Blocks
            # context so the components are bound to this app).
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output]
            )
            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output]
            )
        return demo
    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")
def main():
    """Entry point: build the Gradio app and serve it on 0.0.0.0:7860."""
    try:
        app = build_interface()
        # Launch with explicit server options.
        app.launch(
            server_name="0.0.0.0",  # listen on all interfaces (container-friendly)
            server_port=7860,
            show_error=True,
            share=False,
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise
# Run the Gradio app only when executed as a script (e.g. `python app.py`),
# not when imported as a module.
if __name__ == "__main__":
    main()