# Hugging Face Space: multimodal A/B-test predictor (Gradio app).
| import os | |
| import json | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from PIL import Image | |
| import numpy as np | |
| import pandas as pd | |
| from transformers import AutoProcessor, ViTModel, AutoTokenizer, AutoModel | |
| from huggingface_hub import hf_hub_download | |
| import gradio as gr | |
| import pytesseract # For OCR | |
| import spaces | |
| import random | |
| import time | |
| import subprocess | |
| import re | |
# Load environment variables from a .env file when python-dotenv is available
# (local development convenience; production relies on real env vars).
try:
    from dotenv import load_dotenv
except ImportError:
    print("โน๏ธ python-dotenv not installed, using system environment variables only")
else:
    load_dotenv()  # Load .env file if it exists
    print("โ Loaded .env file for local development")
# --- 1. Configuration (mirrored from the training scripts) ---
# These must stay identical to the training environment so the checkpoint,
# category mappings and tokenizer settings line up at inference time.
# AI API calls were removed; categorical inputs are provided directly.
MODEL_DIR = "model"
MODEL_SAVE_PATH = os.path.join(MODEL_DIR, "multimodal_gated_model_2.7_GGG.pth")
CAT_MAPPINGS_SAVE_PATH = os.path.join(MODEL_DIR, "multimodal_cat_mappings_GGG.json")

# Hugging Face Model Hub configuration (points at the model repo, not the Space).
HF_MODEL_REPO = "nitish-spz/ABTestPredictor"  # Your model repository
HF_MODEL_FILENAME = "multimodal_gated_model_2.7_GGG.pth"
HF_MAPPINGS_FILENAME = "multimodal_cat_mappings_GGG.json"

# Backbone encoders and the tokenizer truncation limit.
VISION_MODEL_NAME = "google/vit-base-patch16-224-in21k"
TEXT_MODEL_NAME = "distilbert-base-uncased"
MAX_TEXT_LENGTH = 512

# CSV column names consumed by the batch-prediction path.
CONTROL_IMAGE_URL_COLUMN = "controlImage"
VARIANT_IMAGE_URL_COLUMN = "variantImage"

# Categorical side-inputs, in the order the model's embedding layers expect,
# and the embedding width assigned to each feature.
CATEGORICAL_FEATURES = [
    "Business Model", "Customer Type", "grouped_conversion_type",
    "grouped_industry", "grouped_page_type",
]
CATEGORICAL_EMBEDDING_DIMS = {
    "Business Model": 10, "Customer Type": 10, "grouped_conversion_type": 25,
    "grouped_industry": 50, "grouped_page_type": 25,
}
GATED_FUSION_DIM = 64
# --- 2. Model Architecture (Exact Replica from your training script) ---
# This class must be defined to load the saved model weights correctly.
class SupervisedSiameseMultimodal(nn.Module):
    """
    Updated model architecture matching the new GGG version.
    Includes fusion block, BatchNorm, and enhanced directional features.

    Shared (siamese) ViT and text encoders embed both the control and variant
    inputs; categorical embeddings drive a learned softmax gate that weights
    the vision vs. text directional features before a fused MLP head produces
    a single win/lose logit per pair.
    """
    def __init__(self, vision_model_name, text_model_name, cat_mappings, cat_embedding_dims):
        """
        Args:
            vision_model_name: HF hub id for the ViT vision backbone.
            text_model_name: HF hub id for the text backbone.
            cat_mappings: dict feature -> {'num_categories': int, ...}.
            cat_embedding_dims: dict feature -> embedding width.
        """
        super().__init__()
        self.vision_model = ViTModel.from_pretrained(vision_model_name)
        self.text_model = AutoModel.from_pretrained(text_model_name)
        vision_dim = self.vision_model.config.hidden_size
        text_dim = self.text_model.config.hidden_size
        # One embedding table per categorical feature, in CATEGORICAL_FEATURES order.
        self.embedding_layers = nn.ModuleList()
        total_cat_emb_dim = 0
        for feature in CATEGORICAL_FEATURES:
            # Safely handle cases where a feature might not be in mappings
            if feature in cat_mappings:
                num_cats = cat_mappings[feature]['num_categories']
                emb_dim = cat_embedding_dims[feature]
                self.embedding_layers.append(nn.Embedding(num_cats, emb_dim))
                total_cat_emb_dim += emb_dim
        # Gate controller: categorical context -> 2 logits (vision vs. text weight).
        self.gate_controller = nn.Sequential(
            nn.Linear(total_cat_emb_dim, GATED_FUSION_DIM),
            nn.ReLU(),
            nn.Linear(GATED_FUSION_DIM, 2)
        )
        # Updated in_dim calculation to match new architecture:
        # 2 raw embeddings + a 2*dim directional feature per modality (=4*dim each),
        # plus the concatenated categorical embeddings and a 2-dim role marker.
        in_dim = (vision_dim * 4) + (text_dim * 4) + total_cat_emb_dim + 2
        # Add the fusion block
        self.fusion_block = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        # Updated prediction head with BatchNorm
        self.prediction_head = nn.Sequential(
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, vision_dim),
            nn.GELU(),
            nn.LayerNorm(vision_dim),
            nn.Dropout(0.2),
            nn.Linear(vision_dim, vision_dim // 2),
            nn.GELU(),
            nn.LayerNorm(vision_dim // 2),
            nn.Dropout(0.1),
            nn.Linear(vision_dim // 2, 1)
        )

    def forward(self, c_pix, v_pix, c_tok, c_attn, v_tok, v_attn, cat_feats):
        """
        Args:
            c_pix, v_pix: control/variant pixel tensors from the image processor.
            c_tok, c_attn, v_tok, v_attn: tokenized OCR text ids and attention masks.
            cat_feats: int64 tensor of categorical indices, one column per feature.

        Returns:
            Raw logits, one per batch row (apply sigmoid for a win probability).
        """
        # Enhanced forward pass with directional features
        emb_c_vision = self.vision_model(pixel_values=c_pix).pooler_output
        emb_v_vision = self.vision_model(pixel_values=v_pix).pooler_output
        # Antisymmetric difference features encode which direction the pair differs.
        direction_feat_vision = torch.cat([emb_c_vision - emb_v_vision, emb_v_vision - emb_c_vision], dim=1)
        c_text_out = self.text_model(input_ids=c_tok, attention_mask=c_attn).last_hidden_state
        v_text_out = self.text_model(input_ids=v_tok, attention_mask=v_attn).last_hidden_state
        # Mean-pool token embeddings to one vector per document.
        emb_c_text = c_text_out.mean(dim=1)
        emb_v_text = v_text_out.mean(dim=1)
        direction_feat_text = torch.cat([emb_c_text - emb_v_text, emb_v_text - emb_c_text], dim=1)
        cat_embeddings = [layer(cat_feats[:, i]) for i, layer in enumerate(self.embedding_layers)]
        final_cat_embedding = torch.cat(cat_embeddings, dim=1)
        # Softmax gate decides how much vision vs. text evidence to trust.
        gates = F.softmax(self.gate_controller(final_cat_embedding), dim=-1)
        vision_gate = gates[:, 0].unsqueeze(1)
        text_gate = gates[:, 1].unsqueeze(1)
        weighted_vision = direction_feat_vision * vision_gate
        weighted_text = direction_feat_text * text_gate
        batch_size = c_pix.shape[0]
        # Constant [1, 0] role marker replicated per row (control-first ordering).
        role_embedding = torch.tensor([[1, 0]] * batch_size, dtype=torch.float32, device=c_pix.device)
        final_vector = torch.cat([
            emb_c_vision, emb_v_vision,
            emb_c_text, emb_v_text,
            weighted_vision, weighted_text,
            final_cat_embedding,
            role_embedding
        ], dim=1)
        # Pass through the fusion block before the final prediction head
        fused_vector = self.fusion_block(final_vector)
        return self.prediction_head(fused_vector).squeeze(-1)
# --- 3. Loading Models and Processors (Done once on startup) ---
# Optimized for L4 GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"๐ Using device: {device}")
if torch.cuda.is_available():
    print(f"๐ฅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"๐พ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    # AGGRESSIVE optimizations for 4x L4 GPU
    torch.backends.cudnn.benchmark = True  # autotune conv kernels (inputs are fixed-size 224x224)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = False  # Allow non-deterministic for speed
    # Aggressive memory management
    torch.cuda.empty_cache()
    # Enable tensor core usage for maximum performance (TF32 trades a little
    # matmul precision for throughput on Ampere+ GPUs).
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
# Ensure the model directory and category mappings exist so the app can boot
# on a fresh environment (e.g. first run inside a Space container).
os.makedirs(MODEL_DIR, exist_ok=True)  # idempotent; replaces the exists()+makedirs() race
if not os.path.exists(CAT_MAPPINGS_SAVE_PATH):
    print("โ ๏ธ GGG Category mappings not found. Creating default mappings...")
    # Create the standard category mappings expected by the model.
    # Category ORDER matters: list positions become nn.Embedding indices.
    default_mappings = {
        "Business Model": {"num_categories": 4, "categories": ["E-Commerce", "Lead Generation", "Other*", "SaaS"]},
        "Customer Type": {"num_categories": 4, "categories": ["B2B", "B2C", "Both", "Other*"]},
        "grouped_conversion_type": {"num_categories": 6, "categories": ["Direct Purchase", "High-Intent Lead Gen", "Info/Content Lead Gen", "Location Search", "Non-Profit/Community", "Other Conversion"]},
        "grouped_industry": {"num_categories": 14, "categories": ["Automotive & Transportation", "B2B Services", "B2B Software & Tech", "Consumer Services", "Consumer Software & Apps", "Education", "Finance, Insurance & Real Estate", "Food, Hospitality & Travel", "Health & Wellness", "Industrial & Manufacturing", "Media & Entertainment", "Non-Profit & Government", "Other", "Retail & E-commerce"]},
        "grouped_page_type": {"num_categories": 5, "categories": ["Awareness & Discovery", "Consideration & Evaluation", "Conversion", "Internal & Navigation", "Post-Conversion & Other"]}
    }
    with open(CAT_MAPPINGS_SAVE_PATH, 'w') as f:
        json.dump(default_mappings, f, indent=2)
    print(f"โ Created default category mappings at {CAT_MAPPINGS_SAVE_PATH}")
# Load the (possibly just-created) mappings used everywhere below.
with open(CAT_MAPPINGS_SAVE_PATH, 'r') as f:
    category_mappings = json.load(f)
# Load mapping.json for converting specific values to parent groups
def load_value_mappings():
    """Load mapping.json for converting industry, page_type, and conversion_type to parent groups"""
    try:
        # Prefer the file next to this script; fall back to the working directory.
        candidate = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mapping.json')
        print(f"๐ Looking for mapping file at: {candidate}")
        if not os.path.exists(candidate):
            print(f"โ ๏ธ Mapping file not found, trying fallback location...")
            candidate = 'mapping.json'
        with open(candidate, 'r') as fh:
            data = json.load(fh)
        print(f"โ Successfully loaded mapping.json with {len(data)} mapping types")
        return data
    except Exception as exc:
        # Best effort: a missing/corrupt file degrades to empty mappings.
        print(f"โ ๏ธ Error loading mapping.json: {exc}")
        import traceback
        traceback.print_exc()
        return {}
def convert_to_parent_group(value, mapping_type, value_mappings):
    """
    Convert a specific value to its parent group using mapping.json.

    Args:
        value: The specific value (e.g., "Accounting Services").
        mapping_type: Type of mapping ("industry_mappings", "page_type_mappings",
            "conversion_type_mappings").
        value_mappings: The loaded mapping.json data
            (parent group -> list of child values, per mapping type).

    Returns:
        The parent group name (e.g., "B2B Services"); the input value unchanged
        when it is already a parent group or cannot be found at all.
    """
    if mapping_type not in value_mappings:
        print(f"โ ๏ธ Mapping type '{mapping_type}' not found in mapping.json")
        return value
    mappings = value_mappings[mapping_type]
    # Search for the value in all parent groups
    for parent_group, child_values in mappings.items():
        if value in child_values:
            print(f"โ Mapped '{value}' -> '{parent_group}'")
            return parent_group
    # If not found, check if the value itself is a parent group
    # (idiom: `in mappings` rather than `in mappings.keys()`)
    if value in mappings:
        print(f"โน๏ธ '{value}' is already a parent group")
        return value
    print(f"โ ๏ธ Value '{value}' not found in {mapping_type}, returning as-is")
    return value
# Load confidence scores directly from JSON file
def load_confidence_scores():
    """Load confidence scores from confidence_scores.json"""
    # Get the directory where this script is located
    script_dir = os.path.dirname(os.path.abspath(__file__))
    try:
        confidence_file = os.path.join(script_dir, 'confidence_scores.json')
        print(f"๐ Script directory: {script_dir}")
        print(f"๐ Looking for confidence file at: {confidence_file}")
        print(f"๐ File exists: {os.path.exists(confidence_file)}")
        if not os.path.exists(confidence_file):
            # Try current directory as fallback
            print(f"โ ๏ธ Confidence file not found, trying fallback location...")
            confidence_file = 'confidence_scores.json'
            print(f"๐ Fallback path: {confidence_file}")
            print(f"๐ Fallback exists: {os.path.exists(confidence_file)}")
        with open(confidence_file, 'r') as fh:
            confidence_data = json.load(fh)
        print(f"โ Successfully loaded {len(confidence_data)} confidence score combinations")
        # Print a sample to verify data
        sample_key = next(iter(confidence_data)) if confidence_data else None
        if sample_key:
            print(f"๐ Sample entry: {sample_key} = {confidence_data[sample_key]}")
        return confidence_data
    except FileNotFoundError as e:
        print(f"โ Confidence file not found: {e}")
        print(f"๐ Current working directory: {os.getcwd()}")
        print(f"๐ Files in script dir: {os.listdir(script_dir) if os.path.exists(script_dir) else 'N/A'}")
        return {}
    except Exception as e:
        print(f"โ ๏ธ Error loading confidence scores: {e}")
        import traceback
        traceback.print_exc()
        return {}
# Load value mappings for converting specific values to parent groups.
# Both loads run at import time and degrade to empty dicts on failure so the
# Space still starts without the optional JSON files.
try:
    print("=" * 50)
    print("๐ LOADING VALUE MAPPINGS...")
    print("=" * 50)
    value_mappings = load_value_mappings()
    print(f"โ Value mappings loaded successfully")
    print("=" * 50)
except Exception as e:
    print(f"โ ๏ธ Error loading value mappings: {e}")
    value_mappings = {}
# Load confidence scores (historical accuracy keyed on "industry|page_type").
try:
    print("=" * 50)
    print("๐ LOADING CONFIDENCE SCORES...")
    print("=" * 50)
    confidence_scores = load_confidence_scores()
    print(f"โ Confidence scores loaded successfully: {len(confidence_scores)} combinations")
    print(f"๐ confidence_scores is empty: {len(confidence_scores) == 0}")
    print(f"๐ confidence_scores type: {type(confidence_scores)}")
    print("=" * 50)
except Exception as e:
    print(f"โ ๏ธ Error loading confidence scores: {e}")
    confidence_scores = {}
    print(f"โ confidence_scores set to empty dict: {confidence_scores}")
def get_confidence_data(business_model, customer_type, conversion_type, industry, page_type):
    """Get confidence data based on Industry + Page Type combination (more reliable than 5-feature combinations)"""
    # Only industry and page_type participate in the lookup key; the other
    # parameters are accepted for interface symmetry with the prediction path.
    lookup_key = f"{industry}|{page_type}"
    print(f"๐ Looking for confidence key: '{lookup_key}'")
    print(f"๐ Total confidence_scores loaded: {len(confidence_scores)}")
    print(f"๐ Key exists: {lookup_key in confidence_scores}")
    if lookup_key in confidence_scores:
        matched = confidence_scores[lookup_key]
        print(f"โ Found confidence data: {matched}")
        return matched
    print(f"โ ๏ธ Key '{lookup_key}' not found, using fallback")
    print(f"๐ Available keys with '{industry}': {[k for k in confidence_scores.keys() if industry in k]}")
    # Neutral fallback when no historical stats exist for this combination.
    return {
        'accuracy': 0.5,  # Default fallback
        'count': 0,
        'training_data_count': 0,
        'correct_predictions': 0,
        'actual_wins': 0,
        'predicted_wins': 0
    }
# Instantiate the model with the loaded mappings; the checkpoint weights are
# loaded further below once they have been located locally or downloaded.
model = SupervisedSiameseMultimodal(
    VISION_MODEL_NAME, TEXT_MODEL_NAME, category_mappings, CATEGORICAL_EMBEDDING_DIMS
)
# Download model from Hugging Face Model Hub
def download_model_from_hub():
    """Download model and mappings from Hugging Face Model Hub.

    Returns:
        Path to the downloaded checkpoint, or MODEL_SAVE_PATH pointing at
        freshly-saved dummy weights when the download fails (demo fallback).
    """
    try:
        print(f"๐ฅ Downloading GGG model from Hugging Face Model Hub: {HF_MODEL_REPO}")
        # Download model file
        model_path = hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=HF_MODEL_FILENAME,
            cache_dir=MODEL_DIR
        )
        print(f"โ Model downloaded to: {model_path}")
        # Download category mappings if not exists locally
        if not os.path.exists(CAT_MAPPINGS_SAVE_PATH):
            try:
                mappings_path = hf_hub_download(
                    repo_id=HF_MODEL_REPO,
                    filename=HF_MAPPINGS_FILENAME,
                    cache_dir=MODEL_DIR
                )
                print(f"โ Category mappings downloaded to: {mappings_path}")
                # Copy to expected location (hf_hub_download caches elsewhere)
                import shutil
                shutil.copy(mappings_path, CAT_MAPPINGS_SAVE_PATH)
            except Exception as e:
                # Non-fatal: the default mappings created at startup remain usable.
                print(f"โ ๏ธ Could not download mappings from hub: {e}")
        return model_path
    except Exception as e:
        # Demo fallback: persist the randomly-initialized weights so the app
        # can still run end-to-end without network access.
        print(f"โ ๏ธ Error downloading from Model Hub: {e}")
        print(f"๐ง Creating dummy weights for demo...")
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        return MODEL_SAVE_PATH
# Use local model if available, otherwise download from hub
if os.path.exists(MODEL_SAVE_PATH):
    model_path = MODEL_SAVE_PATH
    print(f"โ Using local GGG model at {MODEL_SAVE_PATH}")
else:
    print(f"๐ฅ Model not found locally, downloading from Model Hub...")
    model_path = download_model_from_hub()
# Load the weights (falls back to the randomly-initialized model on failure).
try:
    print(f"๐ Loading GGG model weights from {model_path}")
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    print("โ Successfully loaded GGG model weights from Hugging Face Model Hub")
except Exception as e:
    print(f"โ ๏ธ Error loading model weights: {e}")
    print("๐ง Using initialized weights for demo...")
model.to(device)
model.eval()
# Warm up the model with a dummy forward pass so the first real request does
# not pay CUDA kernel/cuDNN autotune initialization costs.
if torch.cuda.is_available():
    with torch.no_grad():
        dummy_c_pix = torch.randn(1, 3, 224, 224).to(device)
        dummy_v_pix = torch.randn(1, 3, 224, 224).to(device)
        dummy_c_tok = torch.randint(0, 1000, (1, MAX_TEXT_LENGTH)).to(device)
        dummy_c_attn = torch.ones(1, MAX_TEXT_LENGTH).to(device)
        dummy_v_tok = torch.randint(0, 1000, (1, MAX_TEXT_LENGTH)).to(device)
        dummy_v_attn = torch.ones(1, MAX_TEXT_LENGTH).to(device)
        dummy_cat_feats = torch.randint(0, 2, (1, len(CATEGORICAL_FEATURES))).to(device)
        _ = model(
            c_pix=dummy_c_pix, v_pix=dummy_v_pix,
            c_tok=dummy_c_tok, c_attn=dummy_c_attn,
            v_tok=dummy_v_tok, v_attn=dummy_v_attn,
            cat_feats=dummy_cat_feats
        )
    print("๐ฅ Model warmed up successfully!")
# Load the processors for images and text
image_processor = AutoProcessor.from_pretrained(VISION_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
print("โ Model and processors loaded successfully.")
| # --- 4. Prediction Functions --- | |
| def get_image_path_from_url(image_url: str, base_dir: str) -> str | None: | |
| """Constructs a local image path from a URL-like string.""" | |
| try: | |
| stem = os.path.splitext(os.path.basename(str(image_url)))[0] | |
| return os.path.join(base_dir, f"{stem}.jpeg") | |
| except (TypeError, ValueError): | |
| return None | |
# Maximum allowed duration on free tier
def predict_with_categorical_data(control_image, variant_image, business_model, customer_type, conversion_type, industry, page_type):
    """Make prediction with provided categorical data (no AI API calls)"""
    if control_image is None or variant_image is None:
        return {"error": "Please provide both control and variant images"}
    started = time.time()
    print(f"๐ Original categories from API: {business_model} | {customer_type} | {conversion_type} | {industry} | {page_type}")
    # Map caller-supplied specific values onto the parent groups the model knows.
    grouped_conversion_type = convert_to_parent_group(conversion_type, "conversion_type_mappings", value_mappings)
    grouped_industry = convert_to_parent_group(industry, "industry_mappings", value_mappings)
    grouped_page_type = convert_to_parent_group(page_type, "page_type_mappings", value_mappings)
    print(f"๐ Mapped to parent groups: {business_model} | {customer_type} | {grouped_conversion_type} | {grouped_industry} | {grouped_page_type}")
    # Core multimodal prediction on the grouped categorical data.
    prediction_result = predict_single(control_image, variant_image, business_model, customer_type, grouped_conversion_type, grouped_industry, grouped_page_type)
    # Bundle the prediction with both the raw and grouped categories plus timing.
    return {
        "predictionResults": prediction_result,
        "providedCategories": {
            "businessModel": business_model,
            "customerType": customer_type,
            "conversionType": conversion_type,
            "industry": industry,
            "pageType": page_type
        },
        "groupedCategories": {
            "businessModel": business_model,
            "customerType": customer_type,
            "conversionType": grouped_conversion_type,
            "industry": grouped_industry,
            "pageType": grouped_page_type
        },
        "processingInfo": {
            "totalProcessingTime": f"{time.time() - started:.2f}s",
            "confidenceSource": f"{grouped_industry} | {grouped_page_type}"
        }
    }
# Maximum allowed duration on free tier
def predict_single(control_image, variant_image, business_model, customer_type, conversion_type, industry, page_type):
    """
    Orchestrates the prediction for a single pair of images and features.

    Pipeline: OCR both screenshots -> preprocess pixels/text -> encode the
    five categorical labels as indices -> run the multimodal model -> wrap the
    sigmoid probability with historical confidence statistics.

    Note: This function expects GROUPED values for conversion_type, industry, and page_type.
    If calling from API, use predict_with_categorical_data() which handles the conversion automatically.

    Args:
        control_image, variant_image: numpy arrays (gr.Image type="numpy").
        business_model, customer_type, conversion_type, industry, page_type:
            grouped labels; each must appear in category_mappings or the
            .index() lookup raises and the error payload is returned.

    Returns:
        dict of prediction/confidence fields, or an error payload with a
        neutral 50% confidence on any failure.
    """
    try:
        if control_image is None or variant_image is None:
            return {"Error": 1.0, "Please upload both images": 0.0}
        start_time = time.time()
        print(f"๐ Starting prediction with categories: {business_model} | {customer_type} | {conversion_type} | {industry} | {page_type}")
        c_img = Image.fromarray(control_image).convert("RGB")
        v_img = Image.fromarray(variant_image).convert("RGB")
        # Extract OCR text from both images (this is crucial for model performance)
        try:
            c_text_str = pytesseract.image_to_string(c_img)
            v_text_str = pytesseract.image_to_string(v_img)
            print(f"๐ OCR extracted - Control: {len(c_text_str)} chars, Variant: {len(v_text_str)} chars")
        except pytesseract.TesseractNotFoundError:
            # Degrade gracefully: empty text still flows through the text encoder.
            print("๐ Tesseract is not installed or not in your PATH. Skipping OCR.")
            c_text_str, v_text_str = "", ""
        # Get confidence data for this combination (keyed on industry|page_type)
        confidence_data = get_confidence_data(business_model, customer_type, conversion_type, industry, page_type)
        print(f"๐ Confidence data loaded: {confidence_data}")
        with torch.no_grad():
            c_pix = image_processor(images=c_img, return_tensors="pt").pixel_values.to(device)
            v_pix = image_processor(images=v_img, return_tensors="pt").pixel_values.to(device)
            # Process OCR text through the text model
            c_text = tokenizer(c_text_str, padding='max_length', truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors='pt').to(device)
            v_text = tokenizer(v_text_str, padding='max_length', truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors='pt').to(device)
            # Encode categorical labels to the integer indices used at training time.
            cat_inputs = [business_model, customer_type, conversion_type, industry, page_type]
            cat_codes = [category_mappings[name]['categories'].index(val) for name, val in zip(CATEGORICAL_FEATURES, cat_inputs)]
            cat_feats = torch.tensor([cat_codes], dtype=torch.int64).to(device)
            # Run the multimodal model prediction
            logits = model(
                c_pix=c_pix, v_pix=v_pix,
                c_tok=c_text['input_ids'], c_attn=c_text['attention_mask'],
                v_tok=v_text['input_ids'], v_attn=v_text['attention_mask'],
                cat_feats=cat_feats
            )
            probability = torch.sigmoid(logits).item()
        processing_time = time.time() - start_time
        # Log GPU memory usage for monitoring
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1024**3
            print(f"๐ Prediction completed in {processing_time:.2f}s | GPU Memory: {gpu_memory:.1f}GB")
        else:
            print(f"๐ Prediction completed in {processing_time:.2f}s")
        # Determine winner
        # NOTE(review): 'winner' is computed but never included in the returned
        # payload — confirm whether it should be surfaced to callers.
        winner = "VARIANT WINS" if probability > 0.5 else "CONTROL WINS"
        confidence_percentage = confidence_data['accuracy'] * 100
        # Create enhanced output with confidence scores and training data info
        result = {
            "probability": f"{probability:.3f}",
            "modelConfidence": f"{confidence_percentage:.1f}",
            "trainingDataSamples": confidence_data['training_data_count'],
            "totalPredictions": confidence_data['count'],
            "correctPredictions": confidence_data['correct_predictions'],
            "totalWinPrediction": confidence_data['actual_wins'],
            "totalLosePrediction": confidence_data['count'] - confidence_data['actual_wins']
        }
        print(f"๐ฏ Final result: {result}")
        return result
    except Exception as e:
        print(f"โ ERROR in predict_single: {e}")
        print(f"๐ Error type: {type(e).__name__}")
        import traceback
        traceback.print_exc()
        # Return error result with fallback confidence data
        return {
            "error": f"Prediction failed: {str(e)}",
            "modelConfidence": "50.0",
            "trainingDataSamples": 0,
            "totalPredictions": 0,
            "correctPredictions": 0,
            "totalWinPrediction": 0,
            "totalLosePrediction": 0
        }
def get_all_possible_values(mappings=None):
    """
    Get all possible values (both specific and grouped) for industry, page_type,
    and conversion_type. This is useful for API documentation and validation.

    Args:
        mappings: Optional mapping dict shaped like mapping.json
            (mapping type -> {parent group: [child values]}). Defaults to the
            module-level ``value_mappings`` for backward compatibility.

    Returns:
        dict with keys "industry", "page_type", "conversion_type"; each value
        is a list of parent-group names each followed by its child values.
    """
    if mappings is None:
        mappings = value_mappings
    # (output key, mapping.json section) pairs — one loop replaces three copies.
    sections = [
        ("industry", "industry_mappings"),
        ("page_type", "page_type_mappings"),
        ("conversion_type", "conversion_type_mappings"),
    ]
    all_values = {key: [] for key, _ in sections}
    for key, section in sections:
        for parent_group, child_values in mappings.get(section, {}).items():
            all_values[key].append(parent_group)
            all_values[key].extend(child_values)
    return all_values
def predict_batch(csv_path, control_img_dir, variant_img_dir, num_samples):
    """
    Handles batch prediction from a CSV file.

    Note: CSV should contain grouped values (not specific values) for:
    - grouped_conversion_type
    - grouped_industry
    - grouped_page_type

    Args:
        csv_path: path to the experiments CSV.
        control_img_dir, variant_img_dir: folders holding the .jpeg images.
        num_samples: how many random rows to score (seeded for reproducibility).

    Returns:
        DataFrame of the sampled rows plus a 'predicted_win_probability'
        column, or a single-column "Error" DataFrame on invalid input.
    """
    if not all([csv_path, control_img_dir, variant_img_dir, num_samples]):
        return pd.DataFrame({"Error": ["Please fill in all fields."]})
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        return pd.DataFrame({"Error": [f"CSV file not found at: {csv_path}"]})
    except Exception as e:
        return pd.DataFrame({"Error": [f"Failed to read CSV: {e}"]})
    # gr.Number delivers floats; DataFrame.sample requires an integer n.
    num_samples = int(num_samples)
    if num_samples > len(df):
        print(f"โ ๏ธ Requested {num_samples} samples, but CSV only has {len(df)} rows. Using all rows.")
        num_samples = len(df)
    sample_df = df.sample(n=num_samples, random_state=42)
    results = []
    for _, row in sample_df.iterrows():
        try:
            # Construct image paths
            c_path = get_image_path_from_url(row[CONTROL_IMAGE_URL_COLUMN], control_img_dir)
            v_path = get_image_path_from_url(row[VARIANT_IMAGE_URL_COLUMN], variant_img_dir)
            if not c_path or not os.path.exists(c_path):
                raise FileNotFoundError(f"Control image not found: {c_path}")
            if not v_path or not os.path.exists(v_path):
                raise FileNotFoundError(f"Variant image not found: {v_path}")
            # Get categorical features from the row (expects grouped values in CSV)
            cat_features_from_row = [row[f] for f in CATEGORICAL_FEATURES]
            # Use the core prediction logic
            prediction = predict_single(
                control_image=np.array(Image.open(c_path)),
                variant_image=np.array(Image.open(v_path)),
                business_model=cat_features_from_row[0],
                customer_type=cat_features_from_row[1],
                conversion_type=cat_features_from_row[2],
                industry=cat_features_from_row[3],
                page_type=cat_features_from_row[4]
            )
            result_row = row.to_dict()
            # BUG FIX: predict_single returns the score under 'probability'
            # (there is no 'Win' key), so the old lookup was always 0.0.
            result_row['predicted_win_probability'] = prediction.get('probability', 0.0)
            results.append(result_row)
        except Exception as e:
            # Keep going on per-row failures; surface the error in the output column.
            print(f"๐ Error processing row: {e}")
            error_row = row.to_dict()
            error_row['predicted_win_probability'] = f"ERROR: {e}"
            results.append(error_row)
    return pd.DataFrame(results)
# --- 5. Build the Gradio Interface ---
# Three tabs share the same prediction core: an API-oriented JSON tab, a
# manual-selection tab, and a CSV batch tab.
with gr.Blocks() as iface:
    gr.Markdown("# ๐ Multimodal A/B Test Predictor")
    gr.Markdown("""
    ### Predict A/B test outcomes using:
    - ๐ผ๏ธ **Image Analysis**: Visual features from control & variant images
    - ๐ **OCR Text Extraction**: Automatically extracts and analyzes text from images
    - ๐ **Categorical Features**: Business context (industry, page type, etc.)
    - ๐ฏ **Smart Confidence Scores**: Based on Industry + Page Type combinations with high sample counts
    **Enhanced Reliability**: Confidence scores use Industry + Page Type combinations (avg 160 samples) instead of low-count 5-feature combinations!
    """)
    with gr.Tab("๐ฏ API Prediction"):
        gr.Markdown("### ๐ Predict with Categorical Data")
        gr.Markdown("""
        Upload images and provide categorical data for prediction.
        **Note:** For Industry, Page Type, and Conversion Type, you can provide either:
        - Specific values (e.g., "Accounting Services") - will be automatically converted to parent group (e.g., "B2B Services")
        - Parent group values (e.g., "B2B Services") - will be used directly
        The model uses parent groups internally, but the API accepts both for convenience.
        """)
        with gr.Row():
            with gr.Column():
                # type="numpy" hands the callbacks raw arrays (see predict_single).
                api_control_image = gr.Image(label="Control Image", type="numpy")
                api_variant_image = gr.Image(label="Variant Image", type="numpy")
            with gr.Column():
                # Dropdown choices come straight from the trained category vocabulary.
                api_business_model = gr.Dropdown(choices=category_mappings["Business Model"]['categories'], label="Business Model", value=category_mappings["Business Model"]['categories'][0])
                api_customer_type = gr.Dropdown(choices=category_mappings["Customer Type"]['categories'], label="Customer Type", value=category_mappings["Customer Type"]['categories'][0])
                api_conversion_type = gr.Dropdown(choices=category_mappings["grouped_conversion_type"]['categories'], label="Conversion Type", value=category_mappings["grouped_conversion_type"]['categories'][0])
                api_industry = gr.Dropdown(choices=category_mappings["grouped_industry"]['categories'], label="Industry", value=category_mappings["grouped_industry"]['categories'][0])
                api_page_type = gr.Dropdown(choices=category_mappings["grouped_page_type"]['categories'], label="Page Type", value=category_mappings["grouped_page_type"]['categories'][0])
        api_predict_btn = gr.Button("๐ฏ Predict with Categorical Data", variant="primary", size="lg")
        api_output_json = gr.JSON(label="๐ฏ Prediction Results with Confidence Scores")
    with gr.Tab("๐ Manual Selection"):
        gr.Markdown("### Manual Category Selection")
        gr.Markdown("Select categories manually if you prefer precise control.")
        with gr.Row():
            with gr.Column():
                s_control_image = gr.Image(label="Control Image", type="numpy")
                s_variant_image = gr.Image(label="Variant Image", type="numpy")
            with gr.Column():
                s_business_model = gr.Dropdown(choices=category_mappings["Business Model"]['categories'], label="Business Model", value=category_mappings["Business Model"]['categories'][0])
                s_customer_type = gr.Dropdown(choices=category_mappings["Customer Type"]['categories'], label="Customer Type", value=category_mappings["Customer Type"]['categories'][0])
                s_conversion_type = gr.Dropdown(choices=category_mappings["grouped_conversion_type"]['categories'], label="Conversion Type", value=category_mappings["grouped_conversion_type"]['categories'][0])
                s_industry = gr.Dropdown(choices=category_mappings["grouped_industry"]['categories'], label="Industry", value=category_mappings["grouped_industry"]['categories'][0])
                s_page_type = gr.Dropdown(choices=category_mappings["grouped_page_type"]['categories'], label="Page Type", value=category_mappings["grouped_page_type"]['categories'][0])
        s_predict_btn = gr.Button("๐ฎ Predict A/B Test Winner", variant="secondary")
        # NOTE(review): predict_single returns stringified numbers; gr.Label
        # normally expects label->float confidences — verify rendering.
        s_output_label = gr.Label(num_top_classes=6, label="๐ฏ Prediction Results & Confidence Analysis")
    with gr.Tab("Batch Prediction from CSV"):
        gr.Markdown("Provide paths to your data to get predictions for multiple random samples.")
        b_csv_path = gr.Textbox(label="Path to CSV file", placeholder="/path/to/your/data.csv")
        b_control_dir = gr.Textbox(label="Path to Control Images Folder", placeholder="/path/to/control_images/")
        b_variant_dir = gr.Textbox(label="Path to Variant Images Folder", placeholder="/path/to/variant_images/")
        b_num_samples = gr.Number(label="Number of random samples to predict", value=10)
        b_predict_btn = gr.Button("Run Batch Prediction")
        b_output_df = gr.DataFrame(label="Batch Prediction Results")
    # Wire up the components
    api_predict_btn.click(
        fn=predict_with_categorical_data,
        inputs=[api_control_image, api_variant_image, api_business_model, api_customer_type, api_conversion_type, api_industry, api_page_type],
        outputs=api_output_json
    )
    s_predict_btn.click(
        fn=predict_single,
        inputs=[s_control_image, s_variant_image, s_business_model, s_customer_type, s_conversion_type, s_industry, s_page_type],
        outputs=s_output_label
    )
    b_predict_btn.click(
        fn=predict_batch,
        inputs=[b_csv_path, b_control_dir, b_variant_dir, b_num_samples],
        outputs=b_output_df
    )
# Launch the application
if __name__ == "__main__":
    # AGGRESSIVE optimization for 4x L4 GPU - push to maximum limits
    iface.queue(
        max_size=128,  # Much larger queue for heavy concurrent load
        default_concurrency_limit=64  # Push all 4 GPUs to maximum capacity
    ).launch(
        server_name="0.0.0.0",  # bind all interfaces (required inside a Space container)
        server_port=7860,  # Hugging Face Spaces' expected port
        show_error=True  # Show detailed errors for debugging
    )