"""Gradio application for the Camcor item prediction model.

Loads a pre-trained Random Forest classifier from the Hugging Face Hub,
plus locally stored encoders / TF-IDF vectorizer / SVD model, and serves
a web UI for batch classification of items uploaded as JSON files.
"""

import base64
import json
import logging
import os
import pickle
import re
import traceback
from typing import Any, Dict, Tuple

import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import scipy.sparse
from huggingface_hub import hf_hub_download
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the 'punkt' tokenizer (used by the pickled text pipeline).
nltk.download('punkt', quiet=True)

# Set up logging.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Directory containing this script; all local artifacts live next to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Hugging Face access token (None means anonymous access).
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')


def load_models_and_encoders():
    """Load the trained model and every preprocessing artifact.

    Returns:
        dict: keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Exception: re-raised after logging if any artifact fails to load.
    """
    try:
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'

        # NOTE(security): pickle.load executes arbitrary code embedded in
        # the file -- only pull models from trusted repositories.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            # 'use_auth_token' and 'resume_download' are deprecated in
            # huggingface_hub; 'token' is the supported parameter and
            # resumable downloads are now the default behaviour.
            token=access_token,
        )
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")

        # Label encoder: maps predicted class indices back to class names.
        label_encoder = joblib.load(os.path.join(current_dir, 'label_encoder.joblib'))
        logger.info("Label encoder loaded successfully.")

        # MfrID encoder: list of known IDs or dict of pre-built encodings.
        with open(os.path.join(current_dir, 'MfrID_encoder.pkl'), 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")

        # SPSC encoder: carries the bit width used to binarise SPSC codes.
        with open(os.path.join(current_dir, 'SPSC_encoder.pkl'), 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")

        # TF-IDF vectorizer fitted at training time.
        tfidf_vectorizer = joblib.load(
            os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        )
        logger.info("TF-IDF vectorizer loaded successfully.")

        # SVD model that reduces the TF-IDF matrix to dense features.
        svd_model = joblib.load(os.path.join(current_dir, 'svd_model.joblib'))
        logger.info("SVD model loaded successfully.")

        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model,
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise


# Load all models and encoders once, at import time.
models = load_models_and_encoders()

rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']

# Build the MfrID -> encoding lookup regardless of how the encoder was stored.
# NOTE(review): in the list case the values are plain integer positions, but
# encode_mfrid() treats the dict values as iterable bit vectors -- confirm the
# shipped MfrID_encoder.pkl is the dict form before relying on the list path.
if isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
elif isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")


def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as a CSV file.

    The CSV text is embedded as a base64 data URI so no server-side file
    needs to exist.
    """
    csv_bytes = csv_string.encode('utf-8')
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'data:file/csv;base64,{b64}'
    # The anchor markup appeared stripped in the original source; this
    # reconstructs the conventional data-URI download link.
    return f'<a href="{href}" download="predictions.csv">Download Predictions CSV</a>'


def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarise.

    Args:
        json_file: path or file-like object accepted by predict_from_json.
        thresholds: confidence cut-offs to evaluate (immutable default --
            a mutable list default would be shared across calls).

    Returns:
        pd.DataFrame indexed by threshold percentage with the share of
        items needing human review vs. confidently predicted; on failure,
        a DataFrame carrying the error message per threshold.
    """
    results = {}
    try:
        for threshold in thresholds:
            df, _, validation_msg = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                # 'Review' (not 'Class') is the column holding the
                # "Needs Human Review" sentinel in the output DataFrame.
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2),
                }

        # Present thresholds as percentages for readability.
        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x * 100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        # Return a formatted error DataFrame instead of raising, so the UI
        # still has something to render.
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x * 100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds),
        }).set_index('Threshold')
        return error_df


def parse_json_content(content):
    """Parse a JSON string into a list of records.

    A single JSON object is wrapped in a one-element list so callers can
    always iterate. Returns (records, status_message).
    """
    try:
        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]
        return data, "JSON parsing successful."
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."


def preprocess_features(df: pd.DataFrame) -> dict:
    """Preprocess features exactly as done in training.

    Returns a dict of plain Python lists keyed by 'Description', 'SPSC'
    and 'MfrID', with the same null handling the training pipeline used.
    """
    features = {}

    # Description: nulls become empty strings.
    features['Description'] = (
        df['Description']
        .fillna('')
        .astype(str)
        .tolist()
    )

    # SPSC: nulls become 0; int first so "42.0"-style floats normalise.
    features['SPSC'] = (
        df['SPSC']
        .fillna(0)
        .astype(int)
        .astype(str)
        .tolist()
    )

    # MfrID: nulls become the 'Unknown' sentinel.
    features['MfrID'] = (
        df['MfrID']
        .fillna('Unknown')
        .astype(str)
        .tolist()
    )

    return features


def preprocess_descriptions(descriptions):
    """Vectorize descriptions with TF-IDF then reduce with the SVD model.

    Returns (dense feature matrix, status message); on error, an all-zero
    matrix of the expected width so downstream hstack still works.
    """
    try:
        tfidf_matrix = tfidf_vectorizer.transform(descriptions)
        svd_result = svd_model.transform(tfidf_matrix)
        validation_message = f"Descriptions preprocessed. Shape: {svd_result.shape}"
        return svd_result, validation_message
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        return (
            np.zeros((len(descriptions), svd_model.n_components)),
            f"Error in description preprocessing: {str(e)}",
        )


def binarize_spsc(spsc_codes):
    """Convert SPSC codes to fixed-width binary bit vectors.

    The bit width comes from the loaded SPSC encoder (dict with a 'bits'
    key, a bare int, or a 32-bit fallback). Unparseable codes become
    all-zero rows. Returns (list of bit lists, status message).
    """
    out = []
    if isinstance(spsc_encoder, dict):
        bits = spsc_encoder['bits']
    elif isinstance(spsc_encoder, int):
        bits = spsc_encoder
    else:
        # Default to 32 bits if we can't determine the correct number.
        bits = 32
        logger.warning(
            f"Unexpected type for SPSC encoder: {type(spsc_encoder)}. Using default 32 bits."
        )
    for v in spsc_codes:
        try:
            v = int(v) if not pd.isna(v) else 0
            enc = format(v, f'0{bits}b')
            out.append([int(c) for c in enc])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {v}: {str(e)}")
            out.append([0] * bits)
    validation_message = f"SPSC codes binarized. Shape: {np.array(out).shape}"
    return out, validation_message


def encode_mfrid(mfrid_list, mfrid_encoder):
    """Convert MfrID values to binary encodings via the provided encoder.

    Args:
        mfrid_list (list): MfrID values to encode.
        mfrid_encoder (dict): mapping from MfrID string to its encoding.

    Returns:
        tuple: (encoded_mfrid, validation_message). Unknown or null IDs
        map to an all-zero vector of the encoder's width.
    """
    # Width of one encoding, inferred from any dict value (0 if empty).
    encoded_length = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0
    encoded_mfrid = []
    for mfrid in mfrid_list:
        if pd.isna(mfrid):
            encoded_mfrid.append([0] * encoded_length)
        else:
            mfrid = str(mfrid)
            if mfrid in mfrid_encoder:
                encoded_mfrid.append(mfrid_encoder[mfrid])
            else:
                encoded_mfrid.append([0] * encoded_length)
    validation_message = f"MfrID encoded. Shape: {np.array(encoded_mfrid).shape}"
    return encoded_mfrid, validation_message


def predict_from_json(json_file, confidence_threshold=0.3):
    """Make predictions from a JSON input file with consistent preprocessing.

    Args:
        json_file: file-like object, object with a .name path (Gradio
            upload), or a filesystem path to a JSON file.
        confidence_threshold: minimum top-class probability below which an
            item is flagged "Needs Human Review".

    Returns:
        (DataFrame, csv_string, validation_log) on success, or
        (None, error_message, validation_log) on failure.
    """
    validation_steps = []
    try:
        # Accept a file-like object, a Gradio upload wrapper, or a path.
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            path = json_file.name if hasattr(json_file, 'name') else json_file
            with open(path, 'r') as f:
                content = f.read()

        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]
        df = pd.DataFrame(data)

        # Centralised preprocessing, mirroring training.
        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")

        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)
        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)
        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)

        # Combine all feature groups into one matrix.
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features),
        ])

        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)

        def _second_best_index(probs):
            # Guard against a degenerate single-class model, where
            # argsort(...)[-2] would raise IndexError.
            order = np.argsort(probs)
            return order[-2] if len(order) > 1 else order[-1]

        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': label_encoder.inverse_transform(predictions),
            'Review': [
                label_encoder.inverse_transform([pred])[0]
                if prob >= confidence_threshold else "Needs Human Review"
                for pred, prob in zip(predictions, max_probabilities)
            ],
            'Confidence Score': [f"{prob * 100:.2f}%" for prob in max_probabilities],
            'Second_Best_Class': [
                label_encoder.inverse_transform([_second_best_index(probs)])[0]
                for probs in probabilities
            ],
            'Second_Best_Score': [
                f"{probs[_second_best_index(probs)] * 100:.2f}%" for probs in probabilities
            ],
        })
        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)
    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        error_message = f"An error occurred:\n{str(e)}"
        validation_steps.append(f"Error: {str(e)}")
        return None, error_message, "\n".join(validation_steps)


def analyze_prediction_confidence(model, X, predictions, confidences):
    """Print diagnostic statistics and return predictions sorted by confidence."""
    print(f"Number of predictions: {len(predictions)}")
    print(f"Average confidence: {np.mean(confidences):.4f}")
    print(f"Confidence distribution:")
    print(pd.Series(confidences).describe())

    # Compare feature distributions.
    print("\nFeature statistics:")
    print(pd.DataFrame(X).describe())

    return pd.DataFrame({
        'prediction': predictions,
        'confidence': confidences,
    }).sort_values('confidence', ascending=False)


def validate_features(X, expected_shape):
    """Validate feature matrix before prediction.

    Returns False when the column count differs from the expected shape;
    all-zero columns are only logged as a warning.
    """
    if X.shape[1] != expected_shape[1]:
        logger.error(f"Feature mismatch: Expected {expected_shape[1]}, got {X.shape[1]}")
        return False

    zero_features = np.all(X == 0, axis=0)
    if np.any(zero_features):
        logger.warning(f"Found {np.sum(zero_features)} all-zero features")

    return True


def predict_and_display(json_file, confidence_threshold):
    """Gradio handler: run predictions and build the UI outputs."""
    try:
        if json_file is None:
            return None, "", "No file uploaded. Please upload a JSON file."
        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is not None:
            download_link = create_download_link(csv_string)
            return df, download_link, validation_steps
        return None, "", f"Error processing predictions: {validation_steps}"
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"


def create_diagnostic_output(json_file):
    """Create comprehensive diagnostic information as a plain-text report."""
    try:
        # Test different thresholds.
        threshold_results = test_different_thresholds(json_file)

        # Predict with the default threshold to get detailed metrics.
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)
        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."

        report = []
        report.append("=== Threshold Analysis ===")
        report.append(
            "Shows percentage of predictions requiring human review at different confidence thresholds:"
        )
        report.append(threshold_results.to_string())

        report.append("\n=== Confidence Distribution ===")
        conf_scores = df['Confidence Score'].str.rstrip('%').astype(float)
        report.append(conf_scores.describe().to_string())

        report.append("\n=== Prediction Distribution ===")
        # 'Review' holds the final label (or the human-review sentinel).
        report.append(df['Review'].value_counts().to_string())

        report.append("\n=== Validation Steps ===")
        report.append(validation_steps)

        return "\n".join(report)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return (
            f"Error generating diagnostic report: {str(e)}\n\n"
            "Please check the input file format and try again."
        )


# Custom CSS for layout and readability; passed to gr.Blocks below because
# assigning demo.css after construction has no effect in current Gradio.
_APP_CSS = """
/* General layout improvements */
.container { max-width: 100% !important; padding: 0 !important; margin: 0 !important; }
/* DataFrame styling */
.table-wrap { overflow: visible !important; max-height: none !important; border: none !important; }
table { width: 100% !important; border-collapse: collapse !important; }
th { background: #f3f4f6 !important; padding: 12px 8px !important; text-align: left !important;
     font-weight: 600 !important; color: #374151 !important; border-bottom: 2px solid #d1d5db !important; }
td { padding: 12px 8px !important; border-bottom: 1px solid #e5e7eb !important; color: #4b5563 !important; }
tr:nth-child(even) { background-color: #f9fafb !important; }
/* Button styling */
button.primary { background-color: #2563eb !important; color: white !important;
                 padding: 10px 20px !important; border-radius: 6px !important; font-weight: 500 !important; }
/* Validation output styling */
.validation-box { background-color: #f3f4f6 !important; border: 1px solid #e5e7eb !important;
                  border-radius: 6px !important; padding: 12px !important; margin: 12px 0 !important; }
/* Download link styling */
.download-link { margin: 12px 0 !important; padding: 8px !important; text-align: center !important;
                 background-color: #f3f4f6 !important; border-radius: 6px !important; }
/* Tab styling */
.tab-nav { border-bottom: 2px solid #e5e7eb !important; padding-bottom: 0 !important; }
.tab-nav button { padding: 12px 20px !important; margin-right: 4px !important; border-radius: 6px 6px 0 0 !important; }
.tab-nav button.selected { background-color: #f3f4f6 !important; border-bottom: 2px solid #2563eb !important; }
"""


def build_interface():
    """Assemble and return the Gradio Blocks interface."""
    try:
        with gr.Blocks(title="Camcor Item Prediction Model", css=_APP_CSS) as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)

            with gr.Tabs():
                # Predictions Tab
                with gr.Tab("Make Predictions"):
                    with gr.Row():
                        # Left column: inputs and validation output.
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json'],
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info=(
                                    "Lower values will result in more predictions, "
                                    "higher values in more human review cases"
                                ),
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")
                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False,
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value="",
                            )

                        # Right column: predictions table (wider).
                        with gr.Column(scale=2):
                            # Headers match the columns predict_from_json
                            # actually emits (fixes the 'Confidence CLass'
                            # typo / mismatch in the original).
                            predictions_df = gr.DataFrame(
                                headers=[
                                    'SKU', 'MfrID', 'Predicted_Class', 'Review',
                                    'Confidence Score', 'Second_Best_Class',
                                    'Second_Best_Score',
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False,
                            )

                # Diagnostics Tab
                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json'],
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")
                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30,
                    )

                # Help Tab
                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link

                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)

            # Event handlers.
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output],
            )
            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output],
            )

        return demo
    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")


def main():
    """Main function to run the Gradio interface."""
    try:
        demo = build_interface()
        # Launch with specific server options.
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            share=False,
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise


if __name__ == "__main__":
    main()