# camcor_rf / app.py — uploaded by PixelPoppie ("ui update", commit 2a0f308)
import base64
import json
import logging
import os
import pickle
import re
import traceback
from typing import Tuple, Dict, Any

import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import scipy.sparse
from huggingface_hub import hf_hub_download
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# Download the 'punkt' tokenizer (no-op if it is already in the NLTK cache);
# quiet=True suppresses the progress output on every app start.
nltk.download('punkt', quiet=True)
# Set up logging.
# NOTE(review): DEBUG is very verbose for a deployed Space — consider INFO.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Define current directory — all encoder/vectorizer artifacts are loaded
# relative to this file, not the process working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Get Hugging Face access token from environment variable.
# May be None; presumably the model repo is then public — verify.
access_token = os.getenv('HUGGINGFACE_HUB_TOKEN')
# Load all necessary files first
def load_models_and_encoders():
    """Load the Random Forest model (from the Hugging Face Hub) plus all
    local encoders/transformers required for inference.

    Returns:
        dict with keys 'rf_model', 'label_encoder', 'mfrid_encoder',
        'spsc_encoder', 'tfidf_vectorizer', 'svd_model'.

    Raises:
        Any load/download exception, after logging it — the app cannot run
        with a partially loaded model stack.
    """
    try:
        # Set the repository IDs
        model_repo_id = 'apoppie/random_forest_model_20241005_202610'
        # Load the trained Random Forest model from Hugging Face Hub.
        # NOTE(review): the filename timestamp (145919) differs from the repo
        # id's (202610) — confirm this is the intended artifact.
        # NOTE(review): `use_auth_token` and `resume_download` are deprecated
        # in recent huggingface_hub releases (`token=` replaces the former) —
        # verify against the pinned library version.
        model_file = hf_hub_download(
            repo_id=model_repo_id,
            filename='random_forest_model_20241005_145919.pkl',
            use_auth_token=access_token,
            resume_download=True
        )
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # acceptable because the artifact comes from our own repository.
        with open(model_file, 'rb') as f:
            rf_model = pickle.load(f)
        logger.info("Random Forest model loaded successfully.")
        # Load label encoder (maps predicted class indices back to names).
        label_encoder_path = os.path.join(current_dir, 'label_encoder.joblib')
        label_encoder = joblib.load(label_encoder_path)
        logger.info("Label encoder loaded successfully.")
        # Load MfrID encoder (type is data-dependent: list or dict — see the
        # normalization into `mfrid_to_index` below).
        mfrid_encoder_path = os.path.join(current_dir, 'MfrID_encoder.pkl')
        with open(mfrid_encoder_path, 'rb') as f:
            mfrid_encoder = pickle.load(f)
        logger.info(f"MfrID encoder loaded. Type: {type(mfrid_encoder)}, Size: {len(mfrid_encoder)}")
        # Load SPSC encoder (consumed by binarize_spsc: dict with 'bits' or int).
        spsc_encoder_path = os.path.join(current_dir, 'SPSC_encoder.pkl')
        with open(spsc_encoder_path, 'rb') as f:
            spsc_encoder = pickle.load(f)
        logger.info(f"SPSC encoder loaded. Type: {type(spsc_encoder)}")
        # Load TF-IDF vectorizer
        tfidf_vectorizer_path = os.path.join(current_dir, 'tfidf_vectorizer_20241006_123542.joblib')
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        logger.info("TF-IDF vectorizer loaded successfully.")
        # Load SVD model (dimensionality reduction over the TF-IDF space).
        svd_model_path = os.path.join(current_dir, 'svd_model.joblib')
        svd_model = joblib.load(svd_model_path)
        logger.info("SVD model loaded successfully.")
        return {
            'rf_model': rf_model,
            'label_encoder': label_encoder,
            'mfrid_encoder': mfrid_encoder,
            'spsc_encoder': spsc_encoder,
            'tfidf_vectorizer': tfidf_vectorizer,
            'svd_model': svd_model
        }
    except Exception as e:
        logger.error(f"Error loading models and encoders: {str(e)}")
        raise
# Load all models and encoders once, at import time; a failure aborts startup.
models = load_models_and_encoders()
# Assign models to global variables consumed by the preprocessing and
# prediction helpers below.
rf_model = models['rf_model']
label_encoder = models['label_encoder']
mfrid_encoder = models['mfrid_encoder']
spsc_encoder = models['spsc_encoder']
tfidf_vectorizer = models['tfidf_vectorizer']
svd_model = models['svd_model']
# Set up mfrid_to_index: normalize the MfrID encoder into a dict.
# NOTE(review): when the encoder is a list, the values here are integer
# indices, but encode_mfrid() calls len() on the dict's values (expecting
# bit vectors) — confirm which format the shipped MfrID_encoder.pkl uses.
if isinstance(mfrid_encoder, list):
    mfrid_to_index = {mfrid: i for i, mfrid in enumerate(mfrid_encoder)}
elif isinstance(mfrid_encoder, dict):
    mfrid_to_index = mfrid_encoder
else:
    raise ValueError(f"Unexpected type for MfrID encoder: {type(mfrid_encoder)}")
# Custom download link button
def create_download_link(csv_string):
    """Return an HTML anchor that downloads *csv_string* as predictions.csv.

    Bug fix: the original did `csv_string.encode().decode()` — a no-op that
    put raw CSV into a data URI declaring base64 encoding, yielding a broken
    link. The payload is now actually base64-encoded.

    NOTE(review): this definition is shadowed by the later
    create_download_link in the Gradio section; consider removing one.
    """
    b64 = base64.b64encode(csv_string.encode('utf-8')).decode('ascii')
    return f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download CSV</a>'
def test_different_thresholds(json_file, thresholds=(0.2, 0.3, 0.4, 0.5)):
    """Run predictions at several confidence thresholds and summarize how
    many rows would need human review at each one.

    Args:
        json_file: path or file-like object accepted by predict_from_json.
        thresholds: iterable of thresholds in (0, 1]. The default is now a
            tuple — the original used a mutable list default, a shared-state
            hazard.

    Returns:
        DataFrame indexed by threshold percentage with "Human Review %" and
        "Confident Predictions %" columns, or an error DataFrame on failure.
    """
    thresholds = list(thresholds)
    results = {}
    try:
        for threshold in thresholds:
            df, _, validation_msg = predict_from_json(json_file, confidence_threshold=threshold)
            if df is not None:
                # 'Review' holds either the accepted class name or the
                # "Needs Human Review" sentinel set by predict_from_json.
                human_review = (df['Review'] == "Needs Human Review").mean()
                results[threshold] = {
                    'human_review_percentage': round(human_review * 100, 2),
                    'confident_predictions': round((1 - human_review) * 100, 2)
                }
        if not results:
            # Surface the real problem instead of failing below with a
            # confusing pandas column-length mismatch on an empty frame.
            raise ValueError("No predictions could be produced for any threshold")
        # Create DataFrame with better formatting
        results_df = pd.DataFrame(results).T
        results_df.index = [f"{int(x*100)}%" for x in results_df.index]
        results_df.index.name = "Threshold"
        results_df.columns = ["Human Review %", "Confident Predictions %"]
        return results_df
    except Exception as e:
        logger.error(f"Error in threshold testing: {str(e)}")
        # Return a formatted error DataFrame
        error_df = pd.DataFrame({
            'Threshold': [f"{int(x*100)}%" for x in thresholds],
            'Error': [str(e)] * len(thresholds)
        }).set_index('Threshold')
        return error_df
def parse_json_content(content):
    """Parse a JSON string into a list of records.

    A single top-level object is wrapped in a one-element list so callers
    can always iterate. Returns (records, status_message); on a parse error
    the record list is empty and the message explains the failure.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON content")
        return [], "Failed to parse JSON content. Please check the format."
    records = parsed if isinstance(parsed, list) else [parsed]
    return records, "JSON parsing successful."
def preprocess_features(df: pd.DataFrame) -> dict:
    """Normalize the three raw input columns the same way training did.

    Null handling mirrors the training pipeline: missing descriptions become
    empty strings, missing SPSC codes become 0, missing MfrIDs become
    'Unknown'. SPSC goes through int before str so '12.0' becomes '12'.

    Returns a dict of plain Python lists keyed 'Description', 'SPSC', 'MfrID'.
    """
    description = df['Description'].fillna('').astype(str)
    spsc = df['SPSC'].fillna(0).astype(int).astype(str)
    mfrid = df['MfrID'].fillna('Unknown').astype(str)
    return {
        'Description': description.tolist(),
        'SPSC': spsc.tolist(),
        'MfrID': mfrid.tolist(),
    }
def preprocess_descriptions(descriptions):
    """Project raw description strings into the trained SVD feature space.

    Relies on the module-level ``tfidf_vectorizer`` and ``svd_model``. On any
    failure it degrades to an all-zero matrix of the right shape instead of
    raising, so one bad batch cannot abort the whole prediction run.

    Returns (feature_matrix, status_message).
    """
    try:
        reduced = svd_model.transform(tfidf_vectorizer.transform(descriptions))
    except Exception as e:
        logger.error(f"Error in preprocess_descriptions: {str(e)}")
        fallback = np.zeros((len(descriptions), svd_model.n_components))
        return fallback, f"Error in description preprocessing: {str(e)}"
    return reduced, f"Descriptions preprocessed. Shape: {reduced.shape}"
def binarize_spsc(spsc_codes):
    """Fixed-width binary-encode a sequence of SPSC codes.

    The bit width comes from the module-level ``spsc_encoder`` (a dict with
    a 'bits' key, or a bare int); anything else falls back to 32 bits with a
    warning. NaN codes are treated as 0; codes that still fail to encode
    become all-zero rows, so the output always has one row per input.

    Returns (rows, status_message).
    """
    if isinstance(spsc_encoder, dict):
        width = spsc_encoder['bits']
    elif isinstance(spsc_encoder, int):
        width = spsc_encoder
    else:
        width = 32  # Default to 32 bits if we can't determine the correct number
        logger.warning(f"Unexpected type for SPSC encoder: {type(spsc_encoder)}. Using default 32 bits.")
    rows = []
    for code in spsc_codes:
        try:
            code = int(code) if not pd.isna(code) else 0
            rows.append([int(bit) for bit in format(code, f'0{width}b')])
        except Exception as e:
            logger.error(f"Error binarizing SPSC code {code}: {str(e)}")
            rows.append([0] * width)
    return rows, f"SPSC codes binarized. Shape: {np.array(rows).shape}"
def encode_mfrid(mfrid_list, mfrid_encoder):
    """
    Convert a list of MfrID values to binary encodings via the given encoder.

    Args:
        mfrid_list (list): MfrID values to encode; NaN/None allowed.
        mfrid_encoder (dict): Maps str(MfrID) -> binary encoding vector.
            The vector width is taken from the first value.
            NOTE(review): if the encoder's values are ints (see the
            list-derived mfrid_to_index at module level) the len() call
            below would fail — confirm the encoder format.

    Returns:
        tuple: (encoded_mfrid, validation_message). Missing or unknown IDs
        become all-zero vectors, so output length matches input length.
    """
    width = len(next(iter(mfrid_encoder.values()))) if mfrid_encoder else 0
    rows = []
    for raw in mfrid_list:
        if pd.isna(raw):
            rows.append([0] * width)
        else:
            rows.append(mfrid_encoder.get(str(raw), [0] * width))
    return rows, f"MfrID encoded. Shape: {np.array(rows).shape}"
def predict_from_json(json_file, confidence_threshold=0.3):
    """
    Make predictions from JSON input file with consistent preprocessing.

    Args:
        json_file: a file-like object (has .read(), assumed UTF-8 bytes) or
            a filesystem path to a JSON file holding one record or a list of
            records. Each record is expected to provide 'ID', 'Description',
            'SPSC' and 'MfrID' keys — missing values are tolerated, but a
            missing column would raise (caught below). TODO confirm schema
            against callers.
        confidence_threshold: minimum top-class probability for a prediction
            to be accepted; below it the row is marked "Needs Human Review".

    Returns:
        (output_df, csv_string, validation_log) on success, or
        (None, error_message, validation_log) on failure.
    """
    validation_steps = []
    try:
        # Parse JSON content — accept both an uploaded file object and a path.
        if hasattr(json_file, 'read'):
            content = json_file.read().decode('utf-8')
        else:
            with open(json_file, 'r') as f:
                content = f.read()
        data = json.loads(content)
        if not isinstance(data, list):
            data = [data]  # single object -> one-row DataFrame
        # Create DataFrame
        df = pd.DataFrame(data)
        # Use the new centralized preprocessing
        features = preprocess_features(df)
        validation_steps.append("Features preprocessed consistently with training")
        # Process features with existing functions
        tfidf_features, desc_msg = preprocess_descriptions(features['Description'])
        validation_steps.append(desc_msg)
        spsc_features, spsc_msg = binarize_spsc(features['SPSC'])
        validation_steps.append(spsc_msg)
        mfrid_features, mfrid_msg = encode_mfrid(features['MfrID'], mfrid_to_index)
        validation_steps.append(mfrid_msg)
        # Combine features: [TF-IDF/SVD | SPSC bits | MfrID bits].
        # Presumably this matches the column order used at training time —
        # verify, the model is order-sensitive.
        X = np.hstack([
            tfidf_features,
            np.array(spsc_features),
            np.array(mfrid_features)
        ])
        # Make predictions
        predictions = rf_model.predict(X)
        probabilities = rf_model.predict_proba(X)
        max_probabilities = np.max(probabilities, axis=1)
        # Create output DataFrame with more detailed information
        output_df = pd.DataFrame({
            'SKU': df['ID'],
            'MfrID': df['MfrID'],
            'Predicted_Class': label_encoder.inverse_transform(predictions),
            # 'Review' repeats the class name when confident enough,
            # otherwise the human-review sentinel used elsewhere in the app.
            'Review': [
                label_encoder.inverse_transform([pred])[0] if prob >= confidence_threshold
                else "Needs Human Review"
                for pred, prob in zip(predictions, max_probabilities)
            ],
            'Confidence Score': [f"{prob*100:.2f}%" for prob in max_probabilities],
            # Runner-up class/score give human reviewers an alternative label.
            'Second_Best_Class': [
                label_encoder.inverse_transform([np.argsort(probs)[-2]])[0]
                for probs in probabilities
            ],
            'Second_Best_Score': [f"{probs[np.argsort(probs)[-2]]*100:.2f}%"
                                  for probs in probabilities]
        })
        validation_steps.append("Predictions completed successfully.")
        return output_df, output_df.to_csv(index=False), "\n".join(validation_steps)
    except Exception as e:
        logger.error("An error occurred", exc_info=True)
        error_message = f"An error occurred:\n{str(e)}"
        validation_steps.append(f"Error: {str(e)}")
        return None, error_message, "\n".join(validation_steps)
# Diagnostic code
def analyze_prediction_confidence(model, X, predictions, confidences):
    """Print summary diagnostics and return predictions ranked by confidence.

    ``model`` is accepted for call-site symmetry but not used. All summary
    output goes to stdout — this is a debugging helper, not the serving path.
    Returns a DataFrame with 'prediction' and 'confidence' columns sorted by
    descending confidence.
    """
    print(f"Number of predictions: {len(predictions)}")
    print(f"Average confidence: {np.mean(confidences):.4f}")
    print(f"Confidence distribution:")
    print(pd.Series(confidences).describe())
    print("\nFeature statistics:")
    print(pd.DataFrame(X).describe())
    ranked = pd.DataFrame({'prediction': predictions, 'confidence': confidences})
    return ranked.sort_values('confidence', ascending=False)
def validate_features(X, expected_shape):
    """Validate a feature matrix before prediction.

    Returns False when the column count differs from the expected (training)
    shape; otherwise True. All-zero columns only trigger a warning — they
    can legitimately occur for categories unseen in this batch.
    """
    if X.shape[1] != expected_shape[1]:
        logger.error(f"Feature mismatch: Expected {expected_shape[1]}, got {X.shape[1]}")
        return False
    # Count dead (all-zero) columns in one pass.
    dead_columns = np.sum(np.all(X == 0, axis=0))
    if dead_columns:
        logger.warning(f"Found {dead_columns} all-zero features")
    return True
# GRADIO Interface
import gradio as gr
import base64
import pandas as pd
import numpy as np
def create_download_link(csv_string):
    """Build an inline data-URI anchor for downloading predictions as CSV.

    The CSV text is base64-encoded into the URI so no server-side file is
    needed; the browser decodes it on click.
    """
    payload = base64.b64encode(csv_string.encode('utf-8')).decode()
    uri = f'data:file/csv;base64,{payload}'
    return f'<a href="{uri}" download="predictions.csv" target="_blank">Download Predictions CSV</a>'
def predict_and_display(json_file, confidence_threshold):
    """Gradio click handler: run predictions and package the UI outputs.

    Returns a 3-tuple matching the output components:
    (predictions DataFrame or None, download-link HTML, status text).
    """
    if json_file is None:
        return None, "", "No file uploaded. Please upload a JSON file."
    try:
        df, csv_string, validation_steps = predict_from_json(json_file, confidence_threshold)
        if df is None:
            return None, "", f"Error processing predictions: {validation_steps}"
        return df, create_download_link(csv_string), validation_steps
    except Exception as e:
        logger.error(f"Error in predict_and_display: {str(e)}")
        return None, "", f"Error processing predictions: {str(e)}"
def create_diagnostic_output(json_file):
    """Create comprehensive diagnostic information.

    Builds a plain-text report with four sections: human-review rates across
    confidence thresholds, the distribution of confidence scores, the
    distribution of final 'Review' labels, and the step-by-step validation
    log from the prediction run. Returns an error string (never raises) so
    the Gradio textbox always gets something displayable.
    """
    try:
        # Test different thresholds
        threshold_results = test_different_thresholds(json_file)
        # Make predictions with default threshold to get detailed metrics
        df, _, validation_steps = predict_from_json(json_file, confidence_threshold=0.3)
        if df is None:
            return "Error: Unable to process predictions. Please check the input file format."
        # Create detailed report
        report = []
        report.append("=== Threshold Analysis ===")
        report.append("Shows percentage of predictions requiring human review at different confidence thresholds:")
        report.append(threshold_results.to_string())
        report.append("\n=== Confidence Distribution ===")
        # Scores are formatted strings like "87.50%" — strip '%' and re-parse.
        conf_scores = df['Confidence Score'].str.rstrip('%').astype(float)
        conf_stats = conf_scores.describe()
        report.append(conf_stats.to_string())
        report.append("\n=== Prediction Distribution ===")
        review_dist = df['Review'].value_counts()  # Changed from 'Class' to 'Review'
        report.append(review_dist.to_string())
        report.append("\n=== Validation Steps ===")
        report.append(validation_steps)
        return "\n".join(report)
    except Exception as e:
        logger.error(f"Error in diagnostic output: {str(e)}")
        return f"Error generating diagnostic report: {str(e)}\n\nPlease check the input file format and try again."
def build_interface():
    """Assemble the Gradio Blocks UI (Predictions, Diagnostics, Help tabs).

    Returns the Blocks app; raises gr.Error if construction fails.

    Bug fix: the predictions table header listed a non-existent
    'Confidence CLass' column — predict_from_json produces a 'Review'
    column in that position, so the header now matches the actual output.
    """
    try:
        with gr.Blocks(title="Camcor Item Prediction Model") as demo:
            gr.Markdown("""
            # Camcor Item Prediction Model
            Upload a JSON file containing product information to get predictions.
            """)
            with gr.Tabs():
                # Predictions Tab
                with gr.Tab("Make Predictions"):
                    # Create a two-column layout
                    with gr.Row():
                        # Left column for inputs
                        with gr.Column(scale=1):
                            input_file = gr.File(
                                label="Upload JSON File",
                                file_types=['.json']
                            )
                            confidence = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.3,
                                step=0.1,
                                label="Confidence Threshold",
                                info="Lower values will result in more predictions, higher values in more human review cases"
                            )
                            predict_btn = gr.Button("Make Predictions", variant="primary")
                            # Validation output in left column
                            validation_output = gr.Textbox(
                                label="Validation Steps",
                                lines=3,
                                max_lines=3,
                                interactive=False
                            )
                            download_link = gr.HTML(
                                label="Download Predictions",
                                value=""
                            )
                        # Right column for predictions, taking more space
                        with gr.Column(scale=2):
                            predictions_df = gr.DataFrame(
                                # Headers must match predict_from_json's
                                # output DataFrame columns, in order.
                                headers=[
                                    'SKU',
                                    'MfrID',
                                    'Predicted_Class',
                                    'Review',  # was the misspelled, non-existent 'Confidence CLass'
                                    'Confidence Score',
                                    'Second_Best_Class',
                                    'Second_Best_Score'
                                ],
                                label="Predictions",
                                value=None,
                                wrap=True,
                                height=600,
                                interactive=False
                            )
                # Diagnostics Tab
                with gr.Tab("Diagnostics"):
                    with gr.Row():
                        diagnostic_file = gr.File(
                            label="Upload JSON for Analysis",
                            file_types=['.json']
                        )
                        analyze_btn = gr.Button("Run Diagnostic Analysis", variant="secondary")
                    diagnostic_output = gr.Textbox(
                        label="Diagnostic Results",
                        lines=20,
                        max_lines=30
                    )
                # Help Tab
                with gr.Tab("Help"):
                    gr.Markdown("""
                    ### Using the Prediction Model
                    1. Upload a JSON file containing items to classify
                    2. Adjust the confidence threshold if needed
                    3. Click "Make Predictions" to get results
                    4. Download results using the provided link
                    ### Understanding the Results
                    - **SKU**: Unique identifier for the item
                    - **MfrID**: Manufacturer ID
                    - **Predicted_Class**: Primary classification prediction
                    - **Review**: Final classification or "Needs Human Review"
                    - **Confidence Score**: Confidence level of primary prediction
                    - **Second_Best_Class**: Alternative classification
                    - **Second_Best_Score**: Confidence level of alternative
                    """)
            # Updated CSS for better layout and readability
            demo.css = """
            /* General layout improvements */
            .container {
            max-width: 100% !important;
            padding: 0 !important;
            margin: 0 !important;
            }
            /* DataFrame styling */
            .table-wrap {
            overflow: visible !important;
            max-height: none !important;
            border: none !important;
            }
            table {
            width: 100% !important;
            border-collapse: collapse !important;
            }
            th {
            background: #f3f4f6 !important;
            padding: 12px 8px !important;
            text-align: left !important;
            font-weight: 600 !important;
            color: #374151 !important;
            border-bottom: 2px solid #d1d5db !important;
            }
            td {
            padding: 12px 8px !important;
            border-bottom: 1px solid #e5e7eb !important;
            color: #4b5563 !important;
            }
            tr:nth-child(even) {
            background-color: #f9fafb !important;
            }
            /* Button styling */
            button.primary {
            background-color: #2563eb !important;
            color: white !important;
            padding: 10px 20px !important;
            border-radius: 6px !important;
            font-weight: 500 !important;
            }
            /* Validation output styling */
            .validation-box {
            background-color: #f3f4f6 !important;
            border: 1px solid #e5e7eb !important;
            border-radius: 6px !important;
            padding: 12px !important;
            margin: 12px 0 !important;
            }
            /* Download link styling */
            .download-link {
            margin: 12px 0 !important;
            padding: 8px !important;
            text-align: center !important;
            background-color: #f3f4f6 !important;
            border-radius: 6px !important;
            }
            /* Tab styling */
            .tab-nav {
            border-bottom: 2px solid #e5e7eb !important;
            padding-bottom: 0 !important;
            }
            .tab-nav button {
            padding: 12px 20px !important;
            margin-right: 4px !important;
            border-radius: 6px 6px 0 0 !important;
            }
            .tab-nav button.selected {
            background-color: #f3f4f6 !important;
            border-bottom: 2px solid #2563eb !important;
            }
            """
            # Set up event handlers (must be registered inside the Blocks
            # context so the components are bound to this app).
            predict_btn.click(
                fn=predict_and_display,
                inputs=[input_file, confidence],
                outputs=[predictions_df, download_link, validation_output]
            )
            analyze_btn.click(
                create_diagnostic_output,
                inputs=[diagnostic_file],
                outputs=[diagnostic_output]
            )
        return demo
    except Exception as e:
        logger.error(f"Error building interface: {str(e)}")
        raise gr.Error(f"Failed to initialize interface: {str(e)}")
def main():
    """Entry point: build the Gradio app and serve it on 0.0.0.0:7860."""
    try:
        app = build_interface()
        # Launch with explicit server options.
        app.launch(
            server_name="0.0.0.0",  # listen on all interfaces (container-friendly)
            server_port=7860,
            show_error=True,
            share=False,
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise
# Run the Gradio app only when executed as a script (e.g. `python app.py`),
# not when imported as a module.
if __name__ == "__main__":
    main()