Spaces:

jwrth
/

naics_embeddings

Sleeping

naics_embeddings / training / scripts / evaluate_hierarchical_embed.py

Joseph Warth

updated README

42aa4d6 about 1 month ago

13 kB

	from pathlib import Path
	import json
	import pickle
	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	from torch.utils.data import Dataset, DataLoader


	# Mini-batch size of the evaluation DataLoader built in main().
	BATCH_SIZE = 64
	# Default width of the encoder and per-level hidden layers of
	# HierarchicalEmbedMLP. main() builds the model with these defaults and then
	# loads saved weights, so they have to agree with the checkpoint's shapes.
	HIDDEN_DIM = 768
	# Default width of the parent-class embeddings of HierarchicalEmbedMLP.
	PARENT_EMBED_DIM = 64
	# Default dropout probability of HierarchicalEmbedMLP.
	DROPOUT = 0.1
	# Use the GPU when CUDA is available, otherwise fall back to the CPU.
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

	# Limit PyTorch to a single intra-op CPU thread.
	# NOTE(review): the motivation is not visible in this file (presumably to
	# avoid CPU oversubscription on a shared host) — confirm before changing.
	torch.set_num_threads(1)


	class HierEmbedDataset(Dataset):
	    """In-memory dataset pairing feature rows with labels for levels y2-y6.

	    ``X`` is copied into a float32 tensor and each of ``y_dict["y2"]`` through
	    ``y_dict["y6"]`` into an int64 tensor stored under the attribute of the
	    same name. Indexing yields a ``(x, y2, y3, y4, y5, y6)`` tuple.
	    """

	    # Keys read from ``y_dict``, in the order they appear in every item.
	    _LEVEL_KEYS = ("y2", "y3", "y4", "y5", "y6")

	    def __init__(self, X, y_dict):
	        self.X = torch.tensor(X, dtype=torch.float32)
	        for key in self._LEVEL_KEYS:
	            setattr(self, key, torch.tensor(y_dict[key], dtype=torch.long))

	    def __len__(self):
	        """Return the number of rows, i.e. the size of the first axis of ``X``."""
	        return self.X.shape[0]

	    def __getitem__(self, idx):
	        """Return the features and the five labels stored at ``idx``."""
	        labels = tuple(getattr(self, key)[idx] for key in self._LEVEL_KEYS)
	        return (self.X[idx],) + labels


	class HierarchicalEmbedMLP(nn.Module):
	    """MLP with a shared encoder and one classification head per level (y2-y6).

	    ``head2`` reads the encoder output directly. Every deeper level N has a
	    ``headN_hidden`` layer whose input width is the encoder output plus a
	    ``parent_embed_dim``-wide embedding of the parent level's class index,
	    followed by a ``headN`` layer producing ``nN`` logits.

	    The class defines no ``forward``; callers run ``encode`` and apply the
	    heads themselves (as ``strict_decode`` does).

	    Args:
	        input_dim: Width of the input feature vectors.
	        n2, n3, n4, n5, n6: Number of classes at each level.
	        hidden_dim: Width of the encoder and of the per-level hidden layers.
	        parent_embed_dim: Width of the parent-class embeddings.
	        dropout: Dropout probability used by every dropout layer.
	    """

	    def __init__(
	        self,
	        input_dim,
	        n2,
	        n3,
	        n4,
	        n5,
	        n6,
	        hidden_dim=HIDDEN_DIM,
	        parent_embed_dim=PARENT_EMBED_DIM,
	        dropout=DROPOUT,
	    ):
	        super().__init__()

	        # Submodule names and creation order are kept stable on purpose:
	        # they determine the state_dict keys of saved checkpoints.
	        encoder_layers = [
	            nn.Linear(input_dim, hidden_dim),
	            nn.ReLU(),
	            nn.Dropout(dropout),
	            nn.Linear(hidden_dim, hidden_dim),
	            nn.ReLU(),
	            nn.Dropout(dropout),
	        ]
	        self.input_block = nn.Sequential(*encoder_layers)

	        # Embedding tables for the parent class fed to the next level's head.
	        self.y2_embed = nn.Embedding(n2, parent_embed_dim)
	        self.y3_embed = nn.Embedding(n3, parent_embed_dim)
	        self.y4_embed = nn.Embedding(n4, parent_embed_dim)
	        self.y5_embed = nn.Embedding(n5, parent_embed_dim)

	        # Width of the concatenation [encoder output, parent embedding].
	        conditioned_dim = hidden_dim + parent_embed_dim

	        self.head2 = nn.Linear(hidden_dim, n2)

	        self.head3_hidden = nn.Linear(conditioned_dim, hidden_dim)
	        self.head3 = nn.Linear(hidden_dim, n3)

	        self.head4_hidden = nn.Linear(conditioned_dim, hidden_dim)
	        self.head4 = nn.Linear(hidden_dim, n4)

	        self.head5_hidden = nn.Linear(conditioned_dim, hidden_dim)
	        self.head5 = nn.Linear(hidden_dim, n5)

	        self.head6_hidden = nn.Linear(conditioned_dim, hidden_dim)
	        self.head6 = nn.Linear(hidden_dim, n6)

	        # Shared activation / dropout applied after each ``headN_hidden``.
	        self.relu = nn.ReLU()
	        self.dropout = nn.Dropout(dropout)

	    def encode(self, x):
	        """Run ``x`` through the shared input block and return the result."""
	        encoded = self.input_block(x)
	        return encoded


	def load_npz_dict(path):
	    """Load every array stored in an ``.npz`` archive into a plain dict.

	    Args:
	        path: Location of the ``.npz`` file, passed straight to ``np.load``.

	    Returns:
	        dict mapping each name in the archive to its loaded array.
	    """
	    # ``np.load`` keeps the archive's file handle open until the returned
	    # object is closed. The context manager releases it as soon as all
	    # arrays have been read into memory.
	    with np.load(path) as archive:
	        return {name: archive[name] for name in archive.files}


	def apply_mask(logits, allowed_mask):
	    """Keep ``logits`` where ``allowed_mask`` is true and use -1e9 elsewhere.

	    The fill tensor has the same shape and dtype as ``logits``, so blocked
	    positions receive a large negative score.
	    """
	    blocked_fill = torch.full_like(logits, fill_value=-1e9)
	    return logits.where(allowed_mask, blocked_fill)


	def strict_decode(model, x, mask23, mask34, mask45, mask56):
	    """Greedily decode levels y2 through y6, restricting each level by its parent.

	    y2 is the argmax of ``model.head2`` on the encoded input. For each deeper
	    level the encoded input is concatenated with the embedding of the parent
	    prediction, passed through that level's hidden layer and head, and the
	    logits of classes not allowed under the predicted parent are replaced via
	    ``apply_mask`` before the softmax and argmax.

	    Args:
	        model: Object exposing ``encode``, ``head2``, ``relu``, ``dropout`` and,
	            for N in 3..6, ``headN_hidden``, ``headN`` and the parent embedding
	            ``y{N-1}_embed``.
	        x: Input batch handed to ``model.encode``.
	        mask23, mask34, mask45, mask56: Parent-to-child masks. The row selected
	            along dim 0 by a predicted parent index is used as the "allowed"
	            condition for the child logits.

	    Returns:
	        dict with ``pred2``..``pred6`` (argmax indices), ``probs2``..``probs6``
	        (softmax of the logits, masked for y3-y6) and ``top5_idx`` (indices of
	        the ``min(5, n_classes)`` largest masked y6 logits per row).
	    """
	    h = model.encode(x)

	    # Level y2 has no parent, so its logits are used unmasked.
	    root_logits = model.head2(h)
	    level_probs = [torch.softmax(root_logits, dim=1)]
	    level_preds = [torch.argmax(root_logits, dim=1)]

	    # (parent embedding, hidden layer, output layer, parent->child mask)
	    # for levels y3, y4, y5 and y6, in decoding order.
	    stages = (
	        (model.y2_embed, model.head3_hidden, model.head3, mask23),
	        (model.y3_embed, model.head4_hidden, model.head4, mask34),
	        (model.y4_embed, model.head5_hidden, model.head5, mask45),
	        (model.y5_embed, model.head6_hidden, model.head6, mask56),
	    )

	    logits = root_logits
	    for parent_embed, hidden_layer, out_layer, child_mask in stages:
	        parent = level_preds[-1]
	        features = torch.cat([h, parent_embed(parent)], dim=1)
	        features = model.dropout(model.relu(hidden_layer(features)))
	        raw_logits = out_layer(features)
	        # One mask row per sample: the children allowed under its parent.
	        allowed = torch.index_select(child_mask, 0, parent)
	        logits = apply_mask(raw_logits, allowed)
	        level_probs.append(torch.softmax(logits, dim=1))
	        level_preds.append(torch.argmax(logits, dim=1))

	    # ``logits`` now holds the masked y6 logits.
	    # NOTE: when fewer than k children are allowed, the remaining top-k slots
	    # are taken by masked-out classes, which all share the fill value.
	    k = min(5, logits.shape[1])
	    top5_idx = torch.topk(logits, k=k, dim=1).indices

	    decoded = dict(zip(("pred2", "pred3", "pred4", "pred5", "pred6"), level_preds))
	    decoded.update(
	        zip(("probs2", "probs3", "probs4", "probs5", "probs6"), level_probs)
	    )
	    decoded["top5_idx"] = top5_idx
	    return decoded


	def idx_to_code(arr, label_map):
	    """Translate class indices into their original label values.

	    Args:
	        arr: Iterable of class indices; each one is cast with ``int`` before
	            the lookup.
	        label_map: Mapping with a ``"to_value"`` entry that is indexed by the
	            integer class index.

	    Returns:
	        list with one label value per element of ``arr``, in input order.
	    """
	    # Resolve the lookup table once instead of once per element.
	    to_value = label_map["to_value"]
	    return [to_value[int(index)] for index in arr]


	# Hierarchy levels, ordered from coarsest (y2) to finest (y6).
	_LEVELS = ("y2", "y3", "y4", "y5", "y6")


	def _load_pickle(path):
	    """Unpickle and return the object stored at ``path``."""
	    # NOTE: unpickling can execute arbitrary code. Only use this on artifacts
	    # produced by this project's own pipeline.
	    with open(path, "rb") as f:
	        return pickle.load(f)


	def _run_inference(model, loader, masks):
	    """Decode every batch of ``loader`` and collect the results as lists.

	    Returns a dict holding, for each level, the predicted class indices
	    (``pred_<level>``) and the largest softmax probability (``prob_<level>``),
	    plus ``top5``: the per-row index lists from ``strict_decode``'s
	    ``top5_idx``.
	    """
	    collected = {f"pred_{level}": [] for level in _LEVELS}
	    collected.update({f"prob_{level}": [] for level in _LEVELS})
	    collected["top5"] = []

	    with torch.no_grad():
	        for batch in loader:
	            # Only the features are needed here; the true labels used for
	            # scoring come from the reference CSV.
	            x = batch[0].to(DEVICE)
	            out = strict_decode(model, x, *masks)

	            for level in _LEVELS:
	                digit = level[1:]
	                collected[f"pred_{level}"].extend(
	                    out[f"pred{digit}"].cpu().numpy().tolist()
	                )
	                top_prob = torch.max(out[f"probs{digit}"], dim=1)[0]
	                collected[f"prob_{level}"].extend(top_prob.cpu().numpy().tolist())

	            collected["top5"].extend(out["top5_idx"].cpu().numpy().tolist())

	    return collected


	def _write_grouped_summaries(out_df, grouped_dir):
	    """Write per-level accuracy tables, grouped by the true label of each level.

	    For every level two CSVs are written to ``grouped_dir``:
	    ``summary_by_<level>.csv`` (all match rates plus the top-5 rate) and
	    ``top5_summary_by_<level>.csv`` (group size and top-5 rate only).
	    """
	    rate_aggs = {
	        f"match_{level}_rate": (f"match_{level}", "mean") for level in _LEVELS
	    }
	    rate_aggs["top5_contains_true_y6_rate"] = ("top5_contains_true_y6", "mean")
	    rate_cols = list(rate_aggs)

	    for level in _LEVELS:
	        grouped = (
	            out_df
	            .groupby(level, dropna=False)
	            .agg(n_obs=(level, "size"), **rate_aggs)
	            .reset_index()
	            .sort_values(["n_obs", level], ascending=[False, True])
	        )
	        grouped[rate_cols] = grouped[rate_cols].round(4)
	        grouped.to_csv(grouped_dir / f"summary_by_{level}.csv", index=False)

	        # The top-5 table is the same grouping restricted to three columns,
	        # so it is sliced from ``grouped`` instead of being recomputed.
	        top5_cols = [level, "n_obs", "top5_contains_true_y6_rate"]
	        grouped[top5_cols].to_csv(
	            grouped_dir / f"top5_summary_by_{level}.csv", index=False
	        )


	def main():
	    """Evaluate the saved hierarchical model on the test split and write reports.

	    Reads the test features, labels and reference rows, the hierarchy masks,
	    label maps, embedder metadata and the ``hierarchical_embed_best.pt``
	    weights from the project tree (located relative to this file), decodes
	    predictions with ``strict_decode`` and writes:

	    * ``outputs/predictions/hierarchical_embed_test_predictions.csv``
	    * ``outputs/predictions/hierarchical_embed_test_summary.json``
	    * grouped CSVs under ``outputs/predictions/grouped_summaries_embed``

	    It also prints the summary metrics and up to ten rows whose y6 prediction
	    is wrong.

	    Raises:
	        ValueError: If the reference CSV and the feature matrix have a
	            different number of rows.
	    """
	    print("starting evaluation", flush=True)

	    project_dir = Path(__file__).resolve().parents[2]

	    processed_dir = project_dir / "data" / "processed"
	    artifacts_dir = project_dir / "training" / "artifacts"
	    hierarchy_dir = artifacts_dir / "hierarchy"
	    label_maps_dir = artifacts_dir / "label_maps"
	    embedder_dir = artifacts_dir / "embedder"
	    models_dir = artifacts_dir / "models"

	    predictions_dir = project_dir / "outputs" / "predictions"
	    predictions_dir.mkdir(parents=True, exist_ok=True)

	    grouped_dir = predictions_dir / "grouped_summaries_embed"
	    grouped_dir.mkdir(parents=True, exist_ok=True)

	    X_test = np.load(processed_dir / "X_test_embed.npy")
	    y_test = load_npz_dict(processed_dir / "y_test_embed.npz")
	    test_ref = pd.read_csv(processed_dir / "test_embed_reference.csv")

	    # Predictions are attached to ``test_ref`` row by row. Check the sizes
	    # up front so a mismatch fails before inference rather than after it.
	    if len(test_ref) != X_test.shape[0]:
	        raise ValueError(
	            f"test_embed_reference.csv has {len(test_ref)} rows but "
	            f"X_test_embed.npy has {X_test.shape[0]}"
	        )

	    hierarchy = _load_pickle(hierarchy_dir / "hierarchy_embed.pkl")
	    label_maps = _load_pickle(label_maps_dir / "label_maps_embed.pkl")
	    embed_metadata = _load_pickle(embedder_dir / "embed_metadata.pkl")

	    input_dim = int(X_test.shape[1])
	    class_counts = {
	        level: len(label_maps[level]["classes"]) for level in _LEVELS
	    }

	    test_ds = HierEmbedDataset(X_test, y_test)
	    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

	    masks = [
	        torch.tensor(hierarchy[name], dtype=torch.bool).to(DEVICE)
	        for name in ("mask23", "mask34", "mask45", "mask56")
	    ]

	    model = HierarchicalEmbedMLP(
	        input_dim=input_dim,
	        n2=class_counts["y2"],
	        n3=class_counts["y3"],
	        n4=class_counts["y4"],
	        n5=class_counts["y5"],
	        n6=class_counts["y6"],
	    ).to(DEVICE)

	    model_path = models_dir / "hierarchical_embed_best.pt"
	    # NOTE: torch.load unpickles the checkpoint, so only load trusted files.
	    # On PyTorch versions that support it, weights_only=True restricts what
	    # the unpickler may construct.
	    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
	    model.eval()

	    results = _run_inference(model, test_loader, masks)

	    out_df = test_ref.copy()

	    # Three separate passes keep the original column order in the CSV.
	    for level in _LEVELS:
	        out_df[f"pred_{level}"] = idx_to_code(
	            results[f"pred_{level}"], label_maps[level]
	        )
	    for level in _LEVELS:
	        out_df[f"pred_prob_{level}"] = results[f"prob_{level}"]
	    for level in _LEVELS:
	        # Compare as strings so numeric and string codes match each other.
	        is_match = out_df[level].astype(str) == out_df[f"pred_{level}"].astype(str)
	        out_df[f"match_{level}"] = is_match.astype(int)

	    # Codes are stringified so joining and the membership test below use the
	    # same representation as the match columns.
	    top5_codes = [
	        [str(code) for code in idx_to_code(row, label_maps["y6"])]
	        for row in results["top5"]
	    ]
	    out_df["pred_top5_y6"] = [" | ".join(codes) for codes in top5_codes]
	    out_df["top5_contains_true_y6"] = [
	        int(true_y6 in codes)
	        for true_y6, codes in zip(out_df["y6"].astype(str), top5_codes)
	    ]

	    row_path = predictions_dir / "hierarchical_embed_test_predictions.csv"
	    out_df.to_csv(row_path, index=False)

	    summary = {
	        "embedder_model_name": embed_metadata["model_name"],
	        "embedding_dim": int(embed_metadata["embedding_dim"]),
	        **{
	            f"acc_{level}": float(out_df[f"match_{level}"].mean())
	            for level in _LEVELS
	        },
	        "top5_y6": float(out_df["top5_contains_true_y6"].mean()),
	        "n_test_rows": int(len(out_df)),
	    }

	    summary_path = predictions_dir / "hierarchical_embed_test_summary.json"
	    with open(summary_path, "w", encoding="utf-8") as f:
	        json.dump(summary, f, indent=2)

	    _write_grouped_summaries(out_df, grouped_dir)

	    print("saved row-level predictions to:", row_path, flush=True)
	    print("saved summary to:", summary_path, flush=True)
	    print("saved grouped summaries to:", grouped_dir, flush=True)
	    print("", flush=True)
	    print("summary metrics:", flush=True)
	    # These entries are not rates and are printed without float formatting.
	    plain_keys = {"embedder_model_name", "n_test_rows", "embedding_dim"}
	    for k, v in summary.items():
	        if k in plain_keys:
	            print(f"{k}: {v}", flush=True)
	        else:
	            print(f"{k}: {v:.4f}", flush=True)

	    print("", flush=True)
	    print("sample errors:", flush=True)
	    err_df = out_df[out_df["match_y6"] == 0].head(10)
	    show_cols = [
	        "company_name",
	        "naics_2022",
	        "pred_y6",
	        "y2",
	        "pred_y2",
	        "y3",
	        "pred_y3",
	        "company_description",
	    ]
	    # The reference CSV may not carry every descriptive column.
	    show_cols = [c for c in show_cols if c in err_df.columns]
	    print(err_df[show_cols].to_string(index=False), flush=True)


	# Run the evaluation only when this file is executed as a script.
	if __name__ == "__main__":
	    main()