| | import gradio as gr |
| | import pickle |
| | import json |
| | import numpy as np |
| | import pandas as pd |
| | import traceback |
| | import nltk |
| | import os |
| | import re |
| | import logging |
| | import scipy.sparse |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.decomposition import TruncatedSVD |
| | from nltk.tokenize import word_tokenize |
| | from huggingface_hub import hf_hub_download |
| | import joblib |
| | from typing import Tuple, Dict, Any |
| |
|
| | |
# Ensure the NLTK tokenizer models are available (no-op if already present).
nltk.download('punkt', quiet=True)

# Module-wide logging; DEBUG is verbose — consider INFO for production.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Directory containing this file; used to locate the bundled encoder/model artifacts.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Hugging Face Hub token for the model repo download.
# May be None when the env var is unset (then only public repos work).
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
|
| | |
def load_models_and_encoders():
    """Load the trained model and every preprocessing artifact.

    Downloads the Random Forest model from the Hugging Face Hub and loads the
    label/MfrID/SPSC encoders, the TF-IDF vectorizer and the SVD model from
    files located next to this script.

    Returns:
        dict: keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Exception: re-raised after logging if any artifact fails to load.
    """
    try:
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'

        # 'token' replaces the deprecated 'use_auth_token' argument, and
        # 'resume_download' is deprecated (resuming is always on in current
        # huggingface_hub releases), so it is dropped.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            token=access_token
        )
        # NOTE(review): pickle.load executes arbitrary code on load — only
        # acceptable because this repo is first-party/trusted.
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")

        label_encoder_path = os.path.join(current_dir, 'label_encoder.joblib')
        label_encoder = joblib.load(label_encoder_path)
        logger.info("Label encoder loaded successfully.")

        mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
        with open(mfrid_encoder_path, 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")

        spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
        with open(spsc_encoder_path, 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")

        tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        logger.info("TF-IDF vectorizer loaded successfully.")

        svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
        svd_model = joblib.load(svd_model_path)
        logger.info("SVD model loaded successfully.")

        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise
| |
|
| | |
# Load everything once at import time so the Gradio callbacks can reuse them.
models = load_models_and_encoders()

rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']

# Normalize the MfrID encoder to a dict, whatever form it was pickled in.
# NOTE(review): in the list branch the values are int positions, but
# encode_mfrid() calls len() on a value (expects list-like encodings) —
# confirm the pickled artifact is actually a dict of bit vectors.
if isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
elif isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")
| |
|
| | |
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as predictions.csv.

    The data URI declares ``base64`` content, so the payload must actually be
    base64-encoded. The previous version embedded the raw CSV bytes
    (``csv_string.encode().decode()`` is a no-op), producing a broken link.
    """
    import base64  # local import keeps this fix self-contained
    payload = base64.b64encode(csv_string.encode('utf-8')).decode('ascii')
    return f'<a href="data:file/csv;base64,{payload}" download="predictions.csv">Download CSV</a>'
| |
|
| |
|
def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarize them.

    Args:
        json_file: path or file-like object accepted by predict_from_json.
        thresholds: iterable of thresholds in (0, 1]. A tuple default avoids
            the shared-mutable-default-argument pitfall.

    Returns:
        pd.DataFrame indexed by threshold percentage with columns
        "Human Review %" and "Confident Predictions %", or an error table when
        predictions could not be generated.
    """
    thresholds = list(thresholds)
    results = {}
    try:
        for threshold in thresholds:
            df, _, validation_msg = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2)
                }

        if not results:
            # Every prediction call failed. Report that instead of crashing
            # below when two column names are assigned to an empty frame.
            return pd.DataFrame({
                'Threshold': [f"{int(x * 100)}%" for x in thresholds],
                'Error': ["No predictions could be generated"] * len(thresholds)
            }).set_index('Threshold')

        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x * 100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x * 100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds)
        }).set_index('Threshold')
        return error_df
| |
|
| |
|
def parse_json_content(content):
    """Parse a JSON string and always hand back a list of records.

    A single top-level object is wrapped in a one-element list so callers can
    treat both shapes uniformly.

    Returns:
        tuple[list, str]: (records, status message); records is empty when
        parsing fails.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."
    records = parsed if isinstance(parsed, list) else [parsed]
    return records, "JSON parsing successful."
| |
|
def preprocess_features(df: pd.DataFrame) -> dict:
    """Extract and normalize the three model inputs from *df*.

    Mirrors the transformations applied at training time:
      - Description: missing -> '' then stringified
      - SPSC: missing -> 0, cast to int, then stringified
      - MfrID: missing -> 'Unknown', stringified

    Returns:
        dict with keys 'Description', 'SPSC', 'MfrID', each a list of str.
    """
    descriptions = df['Description'].fillna('').astype(str)
    spsc_codes = df['SPSC'].fillna(0).astype(int).astype(str)
    manufacturer_ids = df['MfrID'].fillna('Unknown').astype(str)
    return {
        'Description': list(descriptions),
        'SPSC': list(spsc_codes),
        'MfrID': list(manufacturer_ids),
    }
| |
|
def preprocess_descriptions(descriptions):
    """Transform raw description strings into dense SVD features.

    Applies the fitted TF-IDF vectorizer followed by the fitted TruncatedSVD
    projection. On failure a zero matrix of the expected width is returned so
    downstream hstack/predict calls still receive consistent shapes.

    Returns:
        tuple: (feature matrix, status message).
    """
    try:
        sparse_tfidf = tfidf_vectorizer.transform(descriptions)
        dense_features = svd_model.transform(sparse_tfidf)
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        fallback = np.zeros((len(descriptions), svd_model.n_components))
        return fallback, f"Error in description preprocessing: {str(e)}"
    return dense_features, f"Descriptions preprocessed. Shape: {dense_features.shape}"
| |
|
def binarize_spsc(spsc_codes, encoder=None):
    """Convert SPSC codes to fixed-width binary bit vectors.

    Args:
        spsc_codes: iterable of codes (int, numeric str, or NaN).
        encoder: bit-width specification — an int bit count, a dict with a
            'bits' key, or None to use the module-level ``spsc_encoder``.
            (New optional parameter; the default preserves the previous
            behavior while making the function testable in isolation.)

    Returns:
        tuple: (list of bit lists, status message). NaN or unparseable codes
        become all-zero vectors.
    """
    if encoder is None:
        encoder = spsc_encoder
    if isinstance(encoder, dict):
        bits = encoder['bits']
    elif isinstance(encoder, int):
        bits = encoder
    else:
        # Fall back to a sane default rather than failing the whole batch.
        bits = 32
        logger.warning(f"Unexpected type for SPSC encoder: {type(encoder)}. Using default 32 bits.")

    out = []
    for v in spsc_codes:
        try:
            v = int(v) if not pd.isna(v) else 0
            enc = format(v, f'0{bits}b')
            out.append([int(c) for c in enc])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {v}: {str(e)}")
            out.append([0] * bits)

    validation_message = f"SPSC codes binarized. Shape: {np.array(out).shape}"
    return out, validation_message
| |
|
def encode_mfrid(mfrid_list, mfrid_encoder):
    """Map MfrID values onto their binary encodings.

    Args:
        mfrid_list (list): MfrID values; NaN/None or IDs absent from the
            encoder map to an all-zero vector of the encoder's width.
        mfrid_encoder (dict): MfrID (str) -> binary encoding (list of ints).

    Returns:
        tuple: (list of encodings, status message).
    """
    width = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0

    encoded = []
    for raw_id in mfrid_list:
        if pd.isna(raw_id):
            encoded.append([0] * width)
            continue
        encoded.append(mfrid_encoder.get(str(raw_id), [0] * width))

    return encoded, f"MfrID encoded. Shape: {np.array(encoded).shape}"
| |
|
| |
|
def predict_from_json(json_file, confidence_threshold=0.3):
    """Make predictions from JSON input with training-consistent preprocessing.

    Args:
        json_file: file-like object (bytes) or filesystem path to a JSON file
            holding one record or a list of records. Each record must provide
            'ID', 'Description', 'SPSC' and 'MfrID' fields.
        confidence_threshold: minimum top-class probability to accept a
            prediction; below it the row is flagged "Needs Human Review".

    Returns:
        tuple: (output DataFrame, CSV string, newline-joined validation log),
        or (None, error message, validation log) on failure.
    """
    validation_steps = []
    try:
        # Accept both an uploaded file object and a plain path.
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            with open(json_file, 'r') as f:
                content = f.read()

        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]

        df = pd.DataFrame(data)

        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")

        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)

        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)

        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)

        # Column order must match training: [SVD text | SPSC bits | MfrID bits].
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features)
        ])

        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)

        # Decode labels once, instead of calling inverse_transform per row.
        predicted_labels = label_encoder.inverse_transform(predictions)

        # Runner-up class per row: argsort once instead of twice per row.
        # NOTE(review): assumes the model has >= 2 classes — confirm.
        second_best_idx = np.argsort(probabilities, axis=1)[:, -2]
        second_best_labels = label_encoder.inverse_transform(second_best_idx)
        second_best_scores = probabilities[np.arange(len(probabilities)), second_best_idx]

        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': predicted_labels,
            'Review': [
                label if prob >= confidence_threshold else "Needs Human Review"
                for label, prob in zip(predicted_labels, max_probabilities)
            ],
            'Confidence Score': [f"{prob*100:.2f}%" for prob in max_probabilities],
            'Second_Best_Class': second_best_labels,
            'Second_Best_Score': [f"{score*100:.2f}%" for score in second_best_scores]
        })

        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)

    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        error_message = f"An error occurred:\n{str(e)}"
        validation_steps.append(f"Error: {str(e)}")
        return None, error_message, "\n".join(validation_steps)
| |
|
| |
|
| | |
def analyze_prediction_confidence(model, X, predictions, confidences):
    """Print summary diagnostics and return predictions ranked by confidence.

    Args:
        model: unused; kept for interface compatibility.
        X: feature matrix (anything pandas can describe).
        predictions: per-row predicted labels.
        confidences: per-row confidence scores.

    Returns:
        pd.DataFrame with 'prediction'/'confidence' columns, most confident first.
    """
    confidence_series = pd.Series(confidences)
    print(f"Number of predictions: {len(predictions)}")
    print(f"Average confidence: {np.mean(confidences):.4f}")
    print(f"Confidence distribution:")
    print(confidence_series.describe())

    print("\nFeature statistics:")
    print(pd.DataFrame(X).describe())

    ranked = pd.DataFrame({'prediction': predictions, 'confidence': confidences})
    return ranked.sort_values('confidence', ascending=False)
| |
|
def validate_features(X, expected_shape):
    """Validate a feature matrix before prediction.

    Checks the column count against *expected_shape* and warns (without
    failing) when any feature column is entirely zero.

    Returns:
        bool: True when the column count matches, False otherwise.
    """
    expected_cols = expected_shape[1]
    if X.shape[1] != expected_cols:
        logger.error(f"Feature mismatch: Expected {expected_cols}, got {X.shape[1]}")
        return False

    zero_columns = np.all(X == 0, axis=0)
    if np.any(zero_columns):
        # Informational only — an all-zero column often signals an encoder miss.
        logger.warning(f"Found {np.sum(zero_columns)} all-zero features")

    return True
| |
|
| | |
| | import gradio as gr |
| | import base64 |
| | import pandas as pd |
| | import numpy as np |
| |
|
def create_download_link(csv_string):
    """Build an HTML anchor whose data URI carries *csv_string* base64-encoded."""
    encoded = base64.b64encode(csv_string.encode('utf-8')).decode()
    uri = f'data:file/csv;base64,{encoded}'
    return f'<a href="{uri}" download="predictions.csv" target="_blank">Download Predictions CSV</a>'
| |
|
def predict_and_display(json_file, confidence_threshold):
    """Gradio callback: run predictions and package the outputs for the UI.

    Returns:
        tuple: (DataFrame or None, download-link HTML, status text).
    """
    try:
        if json_file is None:
            return None, "", "No file uploaded. Please upload a JSON file."

        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is None:
            return None, "", f"Error processing predictions: {validation_steps}"
        return df, create_download_link(csv_string), validation_steps
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"
| |
|
def create_diagnostic_output(json_file):
    """Create comprehensive diagnostic information for an uploaded JSON file.

    Combines a threshold sweep, the confidence-score distribution, the
    review/accept breakdown and the preprocessing validation log into a single
    plain-text report.
    """
    try:
        threshold_results = test_different_thresholds(json_file)
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)

        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."

        sections = [
            "=== Threshold Analysis ===",
            "Shows percentage of predictions requiring human review at different confidence thresholds:",
            threshold_results.to_string(),
            "\n=== Confidence Distribution ===",
            df['Confidence Score'].str.rstrip('%').astype(float).describe().to_string(),
            "\n=== Prediction Distribution ===",
            df['Review'].value_counts().to_string(),
            "\n=== Validation Steps ===",
            validation_steps,
        ]
        return "\n".join(sections)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return f"Error generating diagnostic report: {str(e)}\n\nPlease check the input file format and try again."
| |
|
def build_interface():
    """Construct the Gradio Blocks UI for the prediction model.

    Layout:
        - "Make Predictions": file upload, confidence slider, results table.
        - "Diagnostics": threshold sweep and confidence report.
        - "Help": usage notes.

    Returns:
        gr.Blocks: the assembled (not yet launched) app.

    Raises:
        gr.Error: if interface construction fails.
    """
    try:
        with gr.Blocks(title="Camcor Item Prediction Model") as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)

            with gr.Tabs():
                with gr.Tab("Make Predictions"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json']
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info="Lower values will result in more predictions, higher values in more human review cases"
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")

                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value=""
                            )

                        with gr.Column(scale=2):
                            # Headers mirror the columns produced by
                            # predict_from_json; the fourth column is 'Review'
                            # (was mislabelled 'Confidence CLass').
                            predictions_df = gr.DataFrame(
                                headers=[
                                    'SKU',
                                    'MfrID',
                                    'Predicted_Class',
                                    'Review',
                                    'Confidence Score',
                                    'Second_Best_Class',
                                    'Second_Best_Score'
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False
                            )

                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json']
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")

                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30
                    )

                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link

                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)

            # Custom CSS applied to the whole app.
            demo.css = """
            /* General layout improvements */
            .container {
                max-width: 100% !important;
                padding: 0 !important;
                margin: 0 !important;
            }

            /* DataFrame styling */
            .table-wrap {
                overflow: visible !important;
                max-height: none !important;
                border: none !important;
            }

            table {
                width: 100% !important;
                border-collapse: collapse !important;
            }

            th {
                background: #f3f4f6 !important;
                padding: 12px 8px !important;
                text-align: left !important;
                font-weight: 600 !important;
                color: #374151 !important;
                border-bottom: 2px solid #d1d5db !important;
            }

            td {
                padding: 12px 8px !important;
                border-bottom: 1px solid #e5e7eb !important;
                color: #4b5563 !important;
            }

            tr:nth-child(even) {
                background-color: #f9fafb !important;
            }

            /* Button styling */
            button.primary {
                background-color: #2563eb !important;
                color: white !important;
                padding: 10px 20px !important;
                border-radius: 6px !important;
                font-weight: 500 !important;
            }

            /* Validation output styling */
            .validation-box {
                background-color: #f3f4f6 !important;
                border: 1px solid #e5e7eb !important;
                border-radius: 6px !important;
                padding: 12px !important;
                margin: 12px 0 !important;
            }

            /* Download link styling */
            .download-link {
                margin: 12px 0 !important;
                padding: 8px !important;
                text-align: center !important;
                background-color: #f3f4f6 !important;
                border-radius: 6px !important;
            }

            /* Tab styling */
            .tab-nav {
                border-bottom: 2px solid #e5e7eb !important;
                padding-bottom: 0 !important;
            }

            .tab-nav button {
                padding: 12px 20px !important;
                margin-right: 4px !important;
                border-radius: 6px 6px 0 0 !important;
            }

            .tab-nav button.selected {
                background-color: #f3f4f6 !important;
                border-bottom: 2px solid #2563eb !important;
            }
            """

            # Wire UI events to their handlers.
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output]
            )

            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output]
            )

        return demo

    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")
| |
|
def main():
    """Build the Gradio interface and start the web server."""
    try:
        app = build_interface()
        # Bind to all interfaces so the app is reachable from a container host.
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            share=False
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise
| |
|
# Start the app only when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()